Ejemplo n.º 1
0
    def _load_resource_configs(self):
        """
        Fill the resource config and resource alias tables of this object.

        Resource configs are read from the `resource_*.json` files in the
        `configs/` directory next to this module, and afterwards from the
        matching files in `$HOME/.radical/pilot/configs/`.  Entries found in
        the user files are merged over already known entries (policy
        'overwrite'), or are added as new entries.

        Aliases are read from the packaged `resource_aliases.json`; if the
        user has a `resource_aliases.json` of her own, its 'aliases' section
        is merged over the packaged ones.

        A `RuntimeError` is raised as soon as one config file fails to load.
        """

        self.is_valid()
        self._prof.prof('config_parser_start', uid=self._uid)

        pkg_root = os.path.dirname(os.path.abspath(__file__))

        # pass 1: configs shipped with the module are stored as they are
        for fname in glob.glob("%s/configs/resource_*.json" % pkg_root):

            try:
                self._log.info("Load resource configurations from %s" % fname)
                parsed = ResourceConfig.from_file(fname)
            except Exception as e:
                self._log.exception("skip config file %s: %s" % (fname, e))
                raise RuntimeError('config error (%s) - abort' % e)

            for label in parsed:
                self._log.info("Load resource configurations for %s" % label)
                self._resource_configs[label] = parsed[label].as_dict()

        # pass 2: user configs extend or overwrite what pass 1 collected
        user_home = os.environ.get('HOME', '')
        user_glob = "%s/.radical/pilot/configs/resource_*.json" % user_home

        for fname in glob.glob(user_glob):

            try:
                parsed = ResourceConfig.from_file(fname)
            except Exception as e:
                self._log.exception("skip config file %s: %s" % (fname, e))
                raise RuntimeError('config error (%s) - abort' % e)

            for label in parsed:
                self._log.info("Load resource configurations for %s" % label)

                if label not in self._resource_configs:
                    # unknown so far -- add as is
                    self._resource_configs[label] = parsed[label].as_dict()
                    continue

                # known already -- the user settings take precedence
                ru.dict_merge(self._resource_configs[label],
                              parsed[label].as_dict(),
                              policy='overwrite')

        # aliases: packaged ones first, then the optional user aliases on top
        pkg_aliases = "%s/configs/resource_aliases.json" % pkg_root
        self._resource_aliases = ru.read_json_str(pkg_aliases)['aliases']

        user_aliases = '%s/.radical/pilot/configs/resource_aliases.json' % user_home
        if os.path.isfile(user_aliases):
            ru.dict_merge(self._resource_aliases,
                          ru.read_json_str(user_aliases).get('aliases', {}),
                          policy='overwrite')

        self._prof.prof('config_parser_stop', uid=self._uid)
Ejemplo n.º 2
0
    def load_resource_config_file_json(self, config_file):
        """Load all resource logins from one config file to a dict.

        The file is parsed with `ru.read_json_str()` and is expected to map
        resource names to config dicts.  An entry is accepted only if

          - it has both a "bundle_agent" and an "ssh" key, and
          - in case "bundle_agent" is `True`: it has a "cluster_type" which
            is listed in `BundleAgent.supported_types`, and its "ssh" section
            has both a "login_server" and a "username" key.

        All other entries are silently skipped.

        Returns the dict of accepted entries, keyed by resource name.

        Raises `BadParameter` if a `ValueError` occurs while the file is
        parsed or evaluated.
        """
        rcfgs = {}

        try:
            rcf_dict = ru.read_json_str(config_file)

            for res_name, cfg in rcf_dict.items():

                # both sections are mandatory for every resource
                if "bundle_agent" not in cfg or "ssh" not in cfg:
                    continue

                # stricter checks apply only if a bundle agent is requested
                # explicitly (identity check: truthy non-bool values do not
                # count)
                if cfg["bundle_agent"] is True:
                    if "cluster_type" not in cfg:
                        continue
                    if cfg["cluster_type"] not in BundleAgent.supported_types:
                        continue
                    if "login_server" not in cfg["ssh"] or \
                       "username"     not in cfg["ssh"]:
                        continue

                rcfgs[res_name] = cfg

        except ValueError as err:
            # name the file we were asked to load in the error message
            raise BadParameter("Couldn't parse resource configuration file '%s': %s." % (config_file, str(err)))

        return rcfgs
Ejemplo n.º 3
0
def cfg(configs):
    """
    Return the 'rs.tests' section of the config file `./configs/<configs>`.

    Raises an `Exception` if that file does not exist; asserts that the
    parsed file contains an 'rs.tests' key.
    """

    path = './configs/%s' % configs

    if os.path.exists(path):
        parsed = ru.read_json_str(path)
    else:
        raise Exception('no such config [%s]' % path)

    assert 'rs.tests' in parsed
    return parsed['rs.tests']
Ejemplo n.º 4
0
    def run_bw_client(self, src, dst, port):
        """
        Run the iperf client script on endpoint `src` and return its result.

        Inside the endpoint's `lock` context, `run-iperf-client.sh` is copied
        from this module's directory to `/tmp/aimes.bundle/iperf/` on the
        endpoint's login server, and is run there via `sh` with the
        arguments `src`, `dst` and `port`.  Once the job finished, the file
        `<src>-<dst>.json` is copied back into the current working directory
        and parsed.

        Returns the tuple (sent, received) of the `bits_per_second` values
        found in the result file, or `(None, None)` if reading or evaluating
        that file fails.
        """
        endpoint = self._endpoints[src]

        # tmp solution
        # TODO add session id
        # TODO reuse session
        job_url = "ssh://"  + endpoint["ssh"]["login_server"]
        dir_url = "sftp://" + endpoint["ssh"]["login_server"] + "/tmp/aimes.bundle/iperf/"

        # the session carries a single ssh context with the endpoint's user
        ssh_ctx = saga.Context("ssh")
        ssh_ctx.user_id = endpoint["ssh"]["username"]

        session = saga.Session(default=False)
        session.add_context(ssh_ctx)

        with endpoint["lock"]:

            # create the remote directory and stage the client script
            workdir = saga.filesystem.Directory(dir_url, saga.filesystem.CREATE_PARENTS, session=session)
            script  = saga.filesystem.File(
                    'file://localhost/%s/run-iperf-client.sh' % os.path.dirname(__file__), session=session)
            script.copy(workdir.get_url())

            # describe the job and run it to completion
            service = saga.job.Service(job_url, session=session)

            descr = saga.job.Description()
            descr.working_directory = workdir.get_url().path
            descr.executable        = 'sh'
            descr.arguments         = ["run-iperf-client.sh", src, dst, port]

            job = service.create_job(descr)
            job.run()
            job.wait()

            print("Job State : %s" % (job.state))
            print("Exitcode  : %s" % (job.exit_code))

            # fetch the result file and extract the two bandwidth values
            result_file = "{}-{}.json".format(src, dst)
            workdir.copy( result_file, 'file://localhost/%s/' % os.getcwd() )

            try:
                summary = ru.read_json_str(result_file)["end"]
                sent    = summary["sum_sent"]["bits_per_second"]
                rcvd    = summary["sum_received"]["bits_per_second"]
                return sent, rcvd

            except Exception:
                logging.exception("Parsing iperf result failed.")
                return None, None
Ejemplo n.º 5
0
    def __init__(self, agent_name):
        """
        Set up a sub-agent instance.

        The agent config is read from `<cwd>/<agent_name>.cfg`.  A session
        is created from a copy of that config with `_connect=False` and
        with an empty 'components' section, the session's config is merged
        back into the agent config, and `rpu.Worker` is initialized with
        the merged config and the session.

        Raises an `AssertionError` if `agent_name` is 'agent_0', and
        a `RuntimeError` if the session reports to be connected.
        """

        assert(agent_name != 'agent_0'), 'expect subagent, not agent_0'
        print "startup agent %s" % agent_name

        # load config, create session and controller, init rpu.Worker
        agent_cfg  = "%s/%s.cfg" % (os.getcwd(), agent_name)
        cfg        = ru.read_json_str(agent_cfg)

        # 'pilot_id' and 'session_id' are required config entries -- a
        # missing entry surfaces as KeyError here
        self._uid         = agent_name
        self._pid         = cfg['pilot_id']
        self._sid         = cfg['session_id']
        self._final_cause = None

        # Create a session.
        #
        # This session will not connect to MongoDB, but will create any
        # communication channels and components/workers specified in the
        # config -- we merge that information into our own config.
        # We don't want the session to start components though, so remove them
        # from the config copy.
        session_cfg = copy.deepcopy(cfg)
        session_cfg['owner']      = self._uid
        session_cfg['components'] = dict()
        session = rp_Session(cfg=session_cfg, _connect=False, uid=self._sid)

        # we still want the bridge addresses known though, so make sure they are
        # merged into our own copy, along with any other additions done by the
        # session.
        # NOTE(review): ru.PRESERVE presumably keeps entries which already
        #               exist in cfg -- confirm against ru.dict_merge
        ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
        pprint.pprint(cfg)

        if session.is_connected:
            raise RuntimeError('agent_n should not connect to mongodb')

        # at this point the session is up and working, and the session
        # controller should have brought up all communication bridges and the
        # agent components.  We are ready to roll!
        rpu.Worker.__init__(self, cfg, session)
Ejemplo n.º 6
0
        # it will thus *not* clean out the session's database record (as that
        # is used for some statistics post-mortem), but will kill all remaining
        # pilots.


#-------------------------------------------------------------------------------
#
if __name__ == "__main__":

    # TODO: the json config should be converted into an mpi_test kernel, once
    # the application kernels become maintainable...

    # the test configs are read from `test.json` in the directory of this
    # script; `dirname` is empty if the script was invoked without a path
    pwd = os.path.dirname(__file__)
    if not pwd:
        pwd = '.'
    configs = ru.read_json_str('%s/test.json' % pwd)
    targets = sys.argv[1:]
    failed = 0

    if not targets:
        # the usage string has a `%s` placeholder for the program name
        print("\n\n\tusage: %s <target> [target] ...\n\n" % sys.argv[0])
        sys.exit(-1)

    for target in targets:

        # targets without a config are reported and skipped
        if target not in configs:
            print('no config found for %s' % target)
            print('known targets: %s' % ', '.join(configs.keys()))
            continue

        cfg = configs[target]
Ejemplo n.º 7
0
def test_read_json():
    '''
    Exercise the json read, write and parse helpers of `ru`.
    '''

    # --------------------------------------------------------------------------
    # plain file read: the content must come back unchanged
    expected = {'test_1': 1,
                'test_2': 'one',
                'test_3': [1, 'one']}

    filename = _write_json(json.dumps(expected))
    loaded   = ru.read_json(filename)

    assert loaded

    # compare in both directions, so that neither side has extra keys
    for key, val in expected.items():
        assert key in loaded
        assert val == loaded[key]

    for key in loaded:
        assert key in expected
        assert expected[key] == loaded[key]

    # --------------------------------------------------------------------------
    # reading with string conversion yields `str` values
    loaded = ru.read_json_str(filename)
    assert isinstance(loaded['test_2'], str)

    # --------------------------------------------------------------------------
    # `write_json` is called with both argument orders
    ru.write_json(filename, loaded)
    ru.write_json(loaded, filename)
    loaded = ru.read_json_str(filename)
    assert len(loaded) == 3

    os.unlink(filename)

    # --------------------------------------------------------------------------
    # parse json from a string instead of a file
    raw = '''{
                  "test_1": 1,
                  "test_2": "one",
                  "test_3": [1, "one"]
              }'''
    parsed = ru.parse_json(raw, filter_comments=False)
    assert len(parsed) == 3
    assert parsed['test_2'] == 'one'

    # --------------------------------------------------------------------------
    # string parsing with forced str conversion
    parsed = ru.parse_json_str(raw)
    assert len(parsed) == 3
    assert isinstance(parsed['test_2'], str)

    # --------------------------------------------------------------------------
    # reading a file with invalid json must raise a ValueError
    filename = _write_raw(b'{"foo": [False]}')
    with pytest.raises(ValueError):
        ru.read_json(filename)
Ejemplo n.º 8
0
    def __init__ (self, database_url=None, database_name="radicalpilot",
                  uid=None, name=None):
        """Creates a new or reconnects to an existing session.

        If called without a uid, a new Session instance is created and
        stored in the database. If uid is set, an existing session is
        retrieved from the database.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name
              (default: 'radicalpilot').  If the database URL contains a path
              element, that path (without the leading slash) is used as
              database name instead.

            * **uid** (`string`): If uid is set, we try
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.  If given,
              it is also used as uid of a newly created session.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`
            * `PilotException` if no database URL is known, or if the
              creation of a new session fails.

        """

        # init the base class inits
        saga.Session.__init__ (self)
        Object.__init__ (self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper ()

        # Lists holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url  = database_url
        self._database_name = database_name

        if  not self._database_url :
            self._database_url = os.getenv ("RADICAL_PILOT_DBURL", None)

        if  not self._database_url :
            raise PilotException ("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url (self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else :
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path   = os.path.dirname(os.path.abspath(__file__))
        default_cfgs  = "%s/configs/*.json" % module_path
        config_files  = glob.glob(default_cfgs)

        for config_file in config_files:

            # broken config files are logged and skipped
            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # user configs extend or overwrite the default configs
        user_cfgs     = "%s/.radical/pilot/configs/*.json" % os.environ.get ('HOME')
        config_files  = glob.glob(user_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if  rc in self._resource_configs :
                    # config exists -- merge user config into it
                    ru.dict_merge (self._resource_configs[rc],
                                   rcs[rc].as_dict(),
                                   policy='overwrite')
                else :
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str (default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected  = None

                if name :
                    self._name = name
                    self._uid  = name
                  # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else :
                    self._uid  = ru.generate_id ('rp.session', mode=ru.ID_PRIVATE)
                    self._name = self._uid

                # use the database name as determined above: a path element
                # in the database URL takes precedence over the
                # `database_name` argument
                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = self._database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception ('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
Ejemplo n.º 9
0
""")

    # all resource config files live in one directory relative to this
    # script; each json file in there gets its own section in the rst output
    config_dir = "{0}/../../src/radical/pilot/configs/".format(script_dir)
    configs = os.listdir(config_dir)
    for config in configs:

        if config.endswith(".json") is False:
            continue  # skip all non-json files

        if config.startswith("aliases") is True:
            continue  # skip alias files

        print(" * %s" % config)

        try:
            # read from the very directory which was listed above, so that
            # this also works if the current directory is not `script_dir`
            json_data = ru.read_json_str("%s%s" % (config_dir, config))
        except Exception as ex:
            print("    * JSON PARSING ERROR: %s" % str(ex))
            continue

        # section title: the file name without the `.json` suffix, underlined
        resources_rst.write("{0}\n".format(config[:-5].upper()))
        resources_rst.write("{0}\n\n".format("=" * len(config[:-5])))

        for host_key, resource_config in json_data.items():
            resource_key = "%s.%s" % (config[:-5], host_key)
            print("   * %s" % resource_key)
            # the default queue is optional
            try:
                default_queue = resource_config["default_queue"]
            except Exception:
                default_queue = None
Ejemplo n.º 10
0
""")

    # all resource config files live in one directory relative to this
    # script; each json file in there gets its own section in the rst output
    config_dir = "{0}/../../src/radical/pilot/configs/".format(script_dir)
    configs = os.listdir(config_dir)
    for config in configs:

        if config.endswith(".json") is False:
            continue # skip all non-json files

        if config.startswith("aliases") is True:
            continue # skip alias files

        print(" * %s" % config)

        try:
            # read from the very directory which was listed above, so that
            # this also works if the current directory is not `script_dir`
            json_data = ru.read_json_str("%s%s" % (config_dir, config))
        except Exception as ex:
            print("    * JSON PARSING ERROR: %s" % str(ex))
            continue

        # section title: the file name without the `.json` suffix, underlined
        resources_rst.write("{0}\n".format(config[:-5].upper()))
        resources_rst.write("{0}\n\n".format("="*len(config[:-5])))

        for host_key, resource_config in json_data.items():
            resource_key = "%s.%s" % (config[:-5], host_key)
            print("   * %s" % resource_key)
            # the default queue is optional
            try:
                default_queue = resource_config["default_queue"]
            except Exception:
                default_queue = None
Ejemplo n.º 11
0
#
# {
#     "scheduler"    : "rp.SCHED_BACKFILLING",
#     "resources"    : ["india.furturegrid.org", "sierra.futuregrid.org"],
#     "resource_cfg" :
#     {
#         "*.futuregrid.org" :
#         {
#             "username"      : "merzky"
#         }
#     }
# }
# the user config is expected in `.my_app.cfg` in the user's home directory;
# `/tmp` is used instead if $HOME is not set
USER_CONFIG_PATH = os.environ.get('HOME', '/tmp') + '/.my_app.cfg'

# load the user config, so that it can be merged with the default config
user_config = ru.read_json_str(USER_CONFIG_PATH)

# merge the user config into the app config, so that the user config keys are
# applied where appropriate
# NOTE(review): `wildcards=True` presumably lets keys like "*.futuregrid.org"
#               match the concrete resource names -- confirm in ru.dict_merge
ru.dict_merge(app_config, user_config, policy='overwrite', wildcards=True)

# let's see what we got
pprint.pprint(app_config)

# this should result in :
#
# {
#     'log_level'   : 0,
#     'scheduler'   : 'rp.SCHED_BACKFILLING',
#     'resources'   : ['india.furturegrid.org', 'sierra.futuregrid.org'],
#     'resource_cfg':
Ejemplo n.º 12
0
    import glob
    # collect the full paths of all `resource_*.json` files in the config
    # directory relative to this script
    configs = glob.glob("%s/../../src/radical/pilot/configs/resource_*.json" %
                        script_dir)
    for config in configs:

        if not config.endswith(".json"):
            continue  # skip all non-json files

        if "/resource_aliases" in config:
            continue  # skip alias files

        print " * %s" % config

        # files which cannot be parsed are reported and skipped
        try:
            json_data = ru.read_json_str(config)
        except Exception, ex:
            print "    * JSON PARSING ERROR: %s" % str(ex)
            continue

        # from here on, `config` is the file name only, not the full path
        config = config.split('/')[-1]

        # section title: the file name without the `.json` suffix, underlined
        resources_rst.write("{0}\n".format(config[:-5].upper()))
        resources_rst.write("{0}\n\n".format("=" * len(config[:-5])))

        for host_key, resource_config in json_data.iteritems():
            resource_key = "%s.%s" % (config[:-5], host_key)
            print "   * %s" % resource_key
            try:
                default_queue = resource_config["default_queue"]
            except Exception, ex:
Ejemplo n.º 13
0
    def __init__(self, agent_name):
        """
        Set up the master agent instance (agent_0).

        The agent config is read from `<cwd>/<agent_name>.cfg` and checked
        for all required settings.  If the environment variable
        RADICAL_PILOT_DB_HOSTPORT is set, host and port of the configured
        'dburl' are replaced by its value.  A session is then created from
        a copy of the config with an empty 'components' section, the
        session's config is merged back into the agent config, and
        `rpu.Worker` is initialized with the result.  Finally the LRMS is
        created, and its `lrms_info` is stored in the config.

        Raises an `AssertionError` if `agent_name` is not 'agent_0',
        a `ValueError` if a required config setting is missing, and
        a `RuntimeError` if the session is not connected.
        """

        assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
        print('startup agent %s' % agent_name)

        # load config, create session, init rpu.Worker
        agent_cfg  = '%s/%s.cfg' % (os.getcwd(), agent_name)
        cfg        = ru.read_json_str(agent_cfg)

        # sanity check on config settings -- this is done before any of the
        # settings is used, so that a missing setting is reported with
        # a meaningful error instead of a KeyError
        required = [('cores',              'Missing number of cores'),
                    ('lrms',               'Missing LRMS'),
                    ('dburl',              'Missing DBURL'),
                    ('pilot_id',           'Missing pilot id'),
                    ('runtime',            'Missing or zero agent runtime'),
                    ('scheduler',          'Missing agent scheduler'),
                    ('session_id',         'Missing session id'),
                    ('spawner',            'Missing agent spawner'),
                    ('task_launch_method', 'Missing unit launch method')]

        for key, msg in required:
            if key not in cfg:
                raise ValueError(msg)

        cfg['agent_name'] = agent_name

        self._uid         = agent_name
        self._pid         = cfg['pilot_id']
        self._sid         = cfg['session_id']
        self._runtime     = cfg['runtime']
        self._starttime   = time.time()
        self._final_cause = None
        self._lrms        = None

        # this better be on a shared FS!
        cfg['workdir']    = os.getcwd()

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
        # the address of the tunnelized DB endpoint. If it exists, we
        # overrule the agent config with it.
        hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
        if hostport:
            dburl = ru.Url(cfg['dburl'])
            dburl.host, dburl.port = hostport.split(':')
            cfg['dburl'] = str(dburl)

        # Create a session.
        #
        # This session will connect to MongoDB, and will also create any
        # communication channels and components/workers specified in the
        # config -- we merge that information into our own config.
        # We don't want the session to start components though, so remove them
        # from the config copy.
        session_cfg = copy.deepcopy(cfg)
        session_cfg['components'] = dict()
        session = rp_Session(cfg=session_cfg, uid=self._sid)

        # we still want the bridge addresses known though, so make sure they are
        # merged into our own copy, along with any other additions done by the
        # session.
        ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
        pprint.pprint(cfg)

        if not session.is_connected:
            raise RuntimeError('agent_0 could not connect to mongodb')

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and the UpdateWorker.  We are
        # ready to rumble!
        rpu.Worker.__init__(self, cfg, session)

        # this is the earliest point to sync bootstrapper and agent profiles
        self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

        # Create LRMS which will give us the set of agent_nodes to use for
        # sub-agent startup.  Add the remaining LRMS information to the
        # config, for the benefit of the scheduler.
        self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg,
                                      session=self._session)

        # add the resource manager information to our own config
        self._cfg['lrms_info'] = self._lrms.lrms_info
Ejemplo n.º 14
0
        # it will thus both clean out the session's database record, and kill
        # all remaining pilots (none in our example).


#------------------------------------------------------------------------------
#
if __name__ == "__main__":

    # TODO: the json config should be converted into an mpi_test kernel, once
    # the application kernels become maintainable...
    print(__file__)

    # the test configs are read from `test.json` in the directory of this
    # script; `dirname` is empty if the script was invoked without a path
    pwd     = os.path.dirname(__file__)
    if not pwd:
        pwd = '.'
    configs = ru.read_json_str ('%s/test.json' % pwd)
    targets = sys.argv[1:]
    failed  = 0

    if not targets:
        # the usage string has a `%s` placeholder for the program name
        print("\n\n\tusage: %s <target> [target] ...\n\n" % sys.argv[0])
        sys.exit (-1)


    for target in targets:

        # targets without a config are reported and skipped
        if target not in configs:
            print('no config found for %s' % target)
            print('known targets: %s' % ', '.join (configs.keys()))
            continue
        
def bootstrap_3():
    """
    This method continues where the bootstrapper left off, but will quickly pass
    control to the Agent class which will spawn the functional components.

    Most of bootstrap_3 applies only to agent_0, in particular all mongodb
    interactions remain excluded for other sub-agent instances.

    The agent interprets a config file, which will specify in an agent_layout
    section:
      - what nodes should be used for sub-agent startup
      - what bridges should be started
      - what components should be started
      - what are the endpoints for bridges which are not started
    bootstrap_3 will create derived config files for all sub-agents.

    The agent master (agent_0) will collect information about the nodes required
    for all instances.  That is added to the config itself, for the benefit of
    the LRMS initialisation which is expected to block those nodes from the
    scheduler.

    The agent name is expected as the only command line argument; the agent
    config is read from `<cwd>/<agent_name>.cfg`.  The module level globals
    `lrms`, `agent` and `bridges` are set by this function, and are torn
    down again before it returns.

    Raises a `RuntimeError` if the number of command line arguments is wrong,
    or (agent_0 only) if no mongodb handle could be obtained.
    """

    global lrms, agent, bridges

    # find out what agent instance name we have
    if len(sys.argv) != 2:
        raise RuntimeError("invalid number of parameters (%s)" % sys.argv)
    agent_name = sys.argv[1]

    # load the agent config, and overload the config dicts
    agent_cfg = "%s/%s.cfg" % (os.getcwd(), agent_name)
    print("startup agent %s : %s" % (agent_name, agent_cfg))

    cfg = ru.read_json_str(agent_cfg)
    cfg["agent_name"] = agent_name
    pilot_id = cfg["pilot_id"]

    # set up a logger and profiler
    prof = ru.Profiler("%s.bootstrap_3" % agent_name)
    prof.prof("sync ref", msg="agent start", uid=pilot_id)
    log = ru.get_logger("%s.bootstrap_3" % agent_name, "%s.bootstrap_3.log" % agent_name, "DEBUG")  # FIXME?
    log.info("start")
    prof.prof("sync ref", msg="agent start")

    # setting the process title is optional: it is skipped if the
    # `setproctitle` module is not usable
    try:
        import setproctitle as spt

        spt.setproctitle("radical.pilot %s" % agent_name)
    except Exception as e:
        log.debug("no setproctitle: %s", e)

    log.setLevel(cfg.get("debug", "INFO"))

    print("Agent config (%s):\n%s\n\n" % (agent_cfg, pprint.pformat(cfg)))

    # quickly set up a mongodb handle so that we can report errors.
    # FIXME: signal handlers need mongo_p, but we won't have that until later
    if agent_name == "agent_0":

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold the
        # address of the tunnelized DB endpoint.
        # If it exists, we overrule the agent config with it.
        hostport = os.environ.get("RADICAL_PILOT_DB_HOSTPORT")
        if hostport:
            dburl = ru.Url(cfg["mongodb_url"])
            dburl.host, dburl.port = hostport.split(":")
            cfg["mongodb_url"] = str(dburl)

        _, mongo_db, _, _, _ = ru.mongodb_connect(cfg["mongodb_url"])
        mongo_p = mongo_db["%s.p" % cfg["session_id"]]

        if not mongo_p:
            raise RuntimeError("could not get a mongodb handle")

    # set up signal and exit handlers
    def exit_handler():
        global lrms, agent, bridges

        print("atexit")
        if lrms:
            lrms.stop()
            lrms = None
        if bridges:
            # `bridges` maps bridge names to bridge descriptions -- the
            # bridge object itself is stored under the 'handle' key, as in
            # the `finally` clause further below
            for b in bridges.values():
                b["handle"].stop()
            bridges = dict()
        if agent:
            agent.stop()
            agent = None
        sys.exit(1)

    # the signal handlers report the failure (agent_0 only), close the
    # profile and exit with a handler specific exit code
    def sigint_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGINT. EXITING (%s)" % frame)
        print("sigint")
        prof.prof("stop", msg="sigint_handler", uid=pilot_id)
        prof.close()
        sys.exit(2)

    def sigterm_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGTERM. EXITING (%s)" % frame)
        print("sigterm")
        prof.prof("stop", msg="sigterm_handler %s" % os.getpid(), uid=pilot_id)
        prof.close()
        sys.exit(3)

    def sigalarm_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGALRM (Walltime limit?). EXITING (%s)" % frame)
        print("sigalrm")
        prof.prof("stop", msg="sigalarm_handler", uid=pilot_id)
        prof.close()
        sys.exit(4)

    import atexit

    atexit.register(exit_handler)
    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGALRM, sigalarm_handler)

    # if anything went wrong up to this point, we would have been unable to
    # report errors into mongodb.  From here on, any fatal error should result
    # in one of the above handlers or exit handlers being activated, thus
    # reporting the error dutifully.

    try:
        # ----------------------------------------------------------------------
        # des Pudels Kern: merge LRMS info into cfg and get the agent started

        if agent_name == "agent_0":

            # only the master agent creates LRMS and sub-agent config files.
            # The LRMS will give us the set of agent_nodes to use for
            # sub-agent startup.  Add the remaining LRMS information to the
            # config, for the benefit of the scheduler.

            lrms = rp.agent.RM.create(name=cfg["lrms"], cfg=cfg, logger=log)
            cfg["lrms_info"] = lrms.lrms_info

            # the master agent also is the only one which starts bridges.  It
            # has to do so before creating the Agent Worker instance, as that is
            # using the bridges already.

            bridges = start_bridges(cfg, log)
            # FIXME: make sure all communication channels are in place.  This could
            # be replaced with a proper barrier, but not sure if that is worth it...
            time.sleep(1)

            # after we started bridges, we'll add their in and out addresses
            # to the config, so that the communication channels can connect to
            # them.  At this point we also write configs for all sub-agents this
            # instance intends to spawn.
            #
            # FIXME: we should point the address to the node of the subagent
            #        which hosts the bridge, not the local IP.  Until this
            #        is fixed, bridges MUST run on agent_0 (which is what
            #        RM.hostip() below will point to).
            nodeip = rp.agent.RM.hostip(cfg.get("network_interface"), logger=log)
            write_sub_configs(cfg, bridges, nodeip, log)

            # Store some runtime information into the session
            mongo_p.update(
                {"_id": pilot_id},
                {"$set": {"lm_info": lrms.lm_info.get("version_info"), "lm_detail": lrms.lm_info.get("lm_detail")}},
            )

        # we now have correct bridge addresses added to the agent_0.cfg, and all
        # other agents will have picked that up from their config files -- we
        # can start the agent and all its components!
        agent = rp.worker.Agent(cfg)
        agent.start()

        log.debug("waiting for agent %s to join" % agent_name)
        agent.join()
        log.debug("agent %s joined" % agent_name)

        # ----------------------------------------------------------------------

    except SystemExit:
        # raised by the `sys.exit()` calls of the signal handlers above
        log.exception("Exit running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "sys.exit"

    except Exception as e:
        log.exception("Error running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "error"

    finally:

        # in all cases, make sure we perform an orderly shutdown.  I hope python
        # does not mind doing all those things in a finally clause of
        # (essentially) main...
        if agent:
            agent.stop()
        log.debug("agent %s finalized" % agent_name)

        # agent.stop will not tear down bridges -- we do that here at last
        for name, b in bridges.items():
            try:
                log.info("closing bridge %s", b)
                b["handle"].stop()
            except Exception as e:
                log.exception("ignore failing bridge terminate (%s)", e)
        bridges = dict()

        # make sure the lrms release whatever it acquired
        if lrms:
            lrms.stop()
            lrms = None

        # agent_0 will also report final pilot state to the DB
        if agent_name == "agent_0":
            if agent and agent.final_cause == "timeout":
                pilot_DONE(mongo_p, pilot_id, log, "TIMEOUT received. Terminating.")
            elif agent and agent.final_cause == "cancel":
                pilot_CANCELED(mongo_p, pilot_id, log, "CANCEL received. Terminating.")
            elif agent and agent.final_cause == "sys.exit":
                pilot_CANCELED(mongo_p, pilot_id, log, "EXIT received. Terminating.")
            elif agent and agent.final_cause == "finalize":
                log.info("shutdown due to component finalization -- assuming error")
                pilot_FAILED(mongo_p, pilot_id, log, "FINALIZE received")
            elif agent:
                pilot_FAILED(mongo_p, pilot_id, log, "TERMINATE received")
            else:
                pilot_FAILED(mongo_p, pilot_id, log, "FAILED startup")

        log.info("stop")
        prof.prof("stop", msg="finally clause agent", uid=pilot_id)
        prof.close()
Ejemplo n.º 16
0
    def __init__(self, agent_name):
        """Set up the master agent (``agent_0``).

        Reads the agent config from ``<cwd>/<agent_name>.cfg``, validates it,
        creates a session from a copy of that config (with the components
        removed), merges the session's config back into the agent config,
        calls ``rpu.Worker.__init__`` and finally creates the LRMS.

        **Arguments:**
            * **agent_name** (`string`): name of this agent, must be
              ``'agent_0'``.

        **Raises:**
            * `AssertionError` if `agent_name` is not ``'agent_0'``.
            * `ValueError` if a mandatory config entry is missing.
            * `RuntimeError` if the session is not connected.
        """

        assert(agent_name == 'agent_0'), 'expect agent_0, not subagent'
        print('startup agent %s' % agent_name)

        # load config, create session, init rpu.Worker
        agent_cfg  = '%s/%s.cfg' % (os.getcwd(), agent_name)
        cfg        = ru.read_json_str(agent_cfg)

        # sanity check on config settings.  This has to happen *before* any of
        # the entries is read below -- otherwise a missing 'pilot_id',
        # 'session_id' or 'runtime' surfaces as a bare KeyError instead of the
        # descriptive ValueError.
        required = [
            ('cores',              'Missing number of cores'),
            ('lrms',               'Missing LRMS'),
            ('dburl',              'Missing DBURL'),
            ('pilot_id',           'Missing pilot id'),
            ('runtime',            'Missing or zero agent runtime'),
            ('scheduler',          'Missing agent scheduler'),
            ('session_id',         'Missing session id'),
            ('spawner',            'Missing agent spawner'),
            ('task_launch_method', 'Missing unit launch method'),
        ]
        for key, msg in required:
            if key not in cfg:
                raise ValueError(msg)

        cfg['agent_name'] = agent_name

        self._uid         = agent_name
        self._pid         = cfg['pilot_id']
        self._sid         = cfg['session_id']
        self._runtime     = cfg['runtime']
        self._starttime   = time.time()
        self._final_cause = None
        self._lrms        = None

        # this better be on a shared FS!
        cfg['workdir']    = os.getcwd()

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
        # the address of the tunnelized DB endpoint. If it exists, we
        # overrule the agent config with it.
        hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
        if hostport:
            dburl = ru.Url(cfg['dburl'])
            dburl.host, dburl.port = hostport.split(':')
            cfg['dburl'] = str(dburl)

        # Create a session.
        #
        # This session will connect to MongoDB, and will also create any
        # communication channels and components/workers specified in the
        # config -- we merge that information into our own config.
        # We don't want the session to start components though, so remove them
        # from the config copy.
        session_cfg = copy.deepcopy(cfg)
        session_cfg['components'] = dict()
        session = rp_Session(cfg=session_cfg, uid=self._sid)

        # we still want the bridge addresses known though, so make sure they are
        # merged into our own copy, along with any other additions done by the
        # session.
        ru.dict_merge(cfg, session._cfg, ru.PRESERVE)
        pprint.pprint(cfg)

        if not session.is_connected:
            raise RuntimeError('agent_0 could not connect to mongodb')

        # at this point the session is up and connected, and it should have
        # brought up all communication bridges and the UpdateWorker.  We are
        # ready to rumble!
        rpu.Worker.__init__(self, cfg, session)

        # this is the earliest point to sync bootstrapper and agent profiles
        self._prof.prof('sync_rel', msg='agent_0 start', uid=self._pid)

        # Create LRMS which will give us the set of agent_nodes to use for
        # sub-agent startup.  Add the remaining LRMS information to the
        # config, for the benefit of the scheduler.
        self._lrms = rpa_rm.RM.create(name=self._cfg['lrms'], cfg=self._cfg,
                                      session=self._session)

        # add the resource manager information to our own config
        self._cfg['lrms_info'] = self._lrms.lrms_info
Ejemplo n.º 17
0
def get_profiles(command, tags=None, url=None, mode=None):
    """
    Retrieve the stored synapse profiles which match `command`.

    Arguments:
        command: the command to look up profiles for.
        tags:    dict of tags to match.  If empty or not given, the tags are
                 parsed from the comma separated `RADICAL_SYNAPSE_TAGS`
                 environment variable, where each element is either `key:val`
                 or just `key` (stored with value `None`).
        url:     profile store URL with schema `mongodb` or `file`.  Defaults
                 to the `RADICAL_SYNAPSE_DBURL` environment variable.
        mode:    a mode, or list of modes, to restrict the result to.

    Returns:
        A list of profile documents, or `None` (after printing a warning) if
        no URL is available.

    Raises:
        RuntimeError: `mongodb` store has no matching profile.
        LookupError:  `file` store has no matching profile.
        ValueError:   `file` URL does not point to an existing directory, or
                      the URL schema is neither `mongodb` nor `file`.
    """

    print(command)

    if not url:
        url = os.environ.get('RADICAL_SYNAPSE_DBURL')

    if not url:
        print("warning: need dburl to retrieve profiles")
        return None

    url = ru.Url(url)

    if mode and not isinstance(mode, list):
        mode = [mode]

    if not tags:
        tags = dict()
        for elem in os.environ.get('RADICAL_SYNAPSE_TAGS', '').split(','):
            if not elem:
                # ignore empty elements (unset variable, stray commas)
                continue
            if ':' in elem:
                key, val = elem.split(':', 1)
                tags[key] = val
            else:
                tags[elem] = None

    command_idx = index_command(command, tags)

    if url.schema == 'mongodb':

        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl(url)

        db_client  = pymongo.MongoClient(host=dbhost, port=port)
        database   = db_client[dbname]
        collection = database['profiles']

        # FIXME: eval partial tags
        query = {'type'        : 'synapse_profile',
                 'tags'        : tags,
                 'command_idx' : command_idx}
        if mode:
            query['mode'] = {'$in': mode}  # FIXME: check

        # materialize the cursor first and test the list, instead of relying
        # on the separate Cursor.count() round trip
        ret = list(collection.find(query))

        if not ret:
            raise RuntimeError("Could not get profile for %s at %s/profiles"
                               % (command, url))

    elif url.schema == 'file':

        path = url.path

        if not os.path.isdir(path):
            raise ValueError("dburl (%s) must point to an existing dir" % url)

        # profile file names are built from the first word of the command
        # index, followed by the tag values (or the tag name for tags without
        # value) in sorted tag order
        name = command_idx.split()[0]
        for tag in sorted(tags.keys()):
            if tags[tag] is not None:
                name += "_%s" % tags[tag]
            else:
                name += "_%s" % tag

        base = "%s/synapse_profile_%s_*.json" % (path, name)
        ret  = list()
        for fname in glob.glob(base):

            doc     = ru.read_json_str(fname)
            docmode = doc['mode'][0:3]  # modes are compared by 3-char prefix

            doc['fname'] = fname

            if doc['command'] != command:
                print("skip command %s" % command)
                print("   ! command %s" % doc['command'])
                continue

            if mode and docmode not in mode:
                print("skip: mode %s not in %s" % (docmode, mode))
                continue

            ret.append(doc)

        if not ret:
            raise LookupError("No matching profile at %s" % base)

    else:
        # without this branch `ret` would be unbound for any other schema
        raise ValueError("dburl (%s) has unsupported schema '%s'"
                         % (url, url.schema))

    return ret
Ejemplo n.º 18
0
    def __init__(self,
                 database_url=None,
                 database_name="radicalpilot",
                 uid=None,
                 name=None):
        """Creates a new session.

        Loads the default and per-user resource configurations and, if no
        `uid` is given, creates a new session in the database.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.  A path element in the URL is
              interpreted as database name and overrules `database_name`.

            * **database_name** (`string`): An alternative database name
              (default: 'radicalpilot').

            * **uid** (`string`): If `None`, a new session is created.
              NOTE(review): handling of a given uid (reconnect) is not part
              of this block -- confirm where it is done.

            * **name** (`string`): An optional human readable name, which is
              then also used as session uid.

        **Raises:**
            * `PilotException` if no database URL is available, or if the
              session cannot be created.

        """

        # init the base class inits
        saga.Session.__init__(self)
        Object.__init__(self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Lists holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url = database_url
        self._database_name = database_name

        if not self._database_url:
            self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not self._database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url(self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else:
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                # a broken config file is skipped, it does not abort startup
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # user configs are loaded after the defaults, so that they can
        # overwrite individual default settings
        user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected = None

                if name:
                    self._name = name
                    self._uid = name
                    # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else:
                    self._uid = ru.generate_id('rp.session',
                                               mode=ru.ID_PRIVATE)
                    self._name = self._uid

                # pass on the *resolved* database name: the one derived from
                # the URL path above, if any, and the `database_name`
                # parameter otherwise
                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = self._database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
Ejemplo n.º 19
0
class Session(saga.Session):
    """A Session encapsulates a RADICAL-Pilot instance and is the *root* object
    for all other RADICAL-Pilot objects.

    A Session holds :class:`radical.pilot.PilotManager` and :class:`radical.pilot.UnitManager`
    instances which in turn hold  :class:`radical.pilot.Pilot` and
    :class:`radical.pilot.ComputeUnit` instances.

    Each Session has a unique identifier, which is either the `name` passed
    to the constructor or a generated id.

    **Example**::

        s = radical.pilot.Session(database_url=DBURL)
    """

    #---------------------------------------------------------------------------
    #
    def __init__(self, database_url=None, database_name=None, name=None):
        """Creates a new session.

        A new Session instance is created and stored in the database, the
        resource configurations are loaded, and session recording is set up
        if the environment variable RADICAL_PILOT_RECORD_SESSION is set.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.  The URL path is interpreted
              as database name.

            * **database_name** (`string`): Deprecated -- only used as
              database name if the URL has no path element
              (default: 'radicalpilot').

            * **name** (`string`): An optional human readable name, which is
              then also used as session uid.

        **Raises:**
            * `PilotException` if no database URL is available, or if the
              session cannot be created.

        """

        logger = ru.get_logger('radical.pilot')

        if database_name:
            logger.error(
                "The 'database_name' parameter is deprecated - please specify an URL path"
            )
        else:
            database_name = 'radicalpilot'

        # init the base class inits
        saga.Session.__init__(self)
        self._dh = ru.DebugHelper()
        self._valid = True
        self._terminate = threading.Event()
        self._terminate.clear()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        # NOTE(review): this is a second DebugHelper next to `self._dh` above;
        #               both attributes are kept, as either may be in use.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = dict()
        self._unit_manager_objects = dict()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        if not database_url:
            database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        if  not self._dburl.path         or \
            self._dburl.path[0]   != '/' or \
            len(self._dburl.path) <=  1  :
            logger.error(
                "incomplete URLs are deprecated -- missing database name!")
            self._dburl.path = database_name  # defaults to 'radicalpilot'

        logger.info("using database %s" % self._dburl)

        # ----------------------------------------------------------------------
        # create new session
        try:
            if name:
                self._name = name
                self._uid = name
                # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
            else:
                self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            logger.report.info('<<create session %s' % self._uid)

            self._dbs = dbSession(sid=self._uid,
                                  name=self._name,
                                  dburl=self._dburl)

            # from here on use the URL as held by the database session
            self._dburl = self._dbs._dburl

            logger.info("New Session created: %s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (self._dburl, ex))

        # initialize profiling
        self.prof = Profiler('%s' % self._uid)
        self.prof.prof('start session', uid=self._uid)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/resource_*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                logger.info("Load resource configurations from %s" %
                            config_file)
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                # a broken config file is skipped, it does not abort startup
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # user configs are loaded after the defaults, so that they can
        # overwrite individual default settings
        user_cfgs = "%s/.radical/pilot/configs/resource_*.json" % os.environ.get(
            'HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Load resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/resource_aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        self.prof.prof('configs parsed', uid=self._uid)

        _rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if _rec:
            self._rec = "%s/%s" % (_rec, self._uid)
            # create the recording dir without going through a shell: the path
            # contains the session name, which may hold spaces or shell
            # metacharacters
            if not os.path.isdir(self._rec):
                os.makedirs(self._rec)
            ru.write_json({'dburl': str(self._dburl)},
                          "%s/session.json" % self._rec)
            logger.info("recording session in %s" % self._rec)
        else:
            self._rec = None

        logger.report.ok('>>ok\n')
Ejemplo n.º 20
0
def get_profiles(command, tags=None, url=None, mode=None):
    """
    Retrieve the stored synapse profiles which match `command`.

    Arguments:
        command: the command to look up profiles for.
        tags:    dict of tags to match.  If empty or not given, the tags are
                 parsed from the comma separated `RADICAL_SYNAPSE_TAGS`
                 environment variable, where each element is either `key:val`
                 or just `key` (stored with value `None`).
        url:     profile store URL with schema `mongodb` or `file`.  Defaults
                 to the `RADICAL_SYNAPSE_DBURL` environment variable.
        mode:    a mode, or list of modes, to restrict the result to.

    Returns:
        A list of profile documents, or `None` (after printing a warning) if
        no URL is available.

    Raises:
        RuntimeError: `mongodb` store has no matching profile.
        LookupError:  `file` store has no matching profile.
        ValueError:   `file` URL does not point to an existing directory, or
                      the URL schema is neither `mongodb` nor `file`.
    """

    print(command)

    if not url:
        url = os.environ.get('RADICAL_SYNAPSE_DBURL')

    if not url:
        print("warning: need dburl to retrieve profiles")
        return None

    url = ru.Url(url)

    if mode and not isinstance(mode, list):
        mode = [mode]

    if not tags:
        tags = dict()
        env_tags = os.environ.get('RADICAL_SYNAPSE_TAGS', '')
        for elem in env_tags.split(','):
            if not elem:
                # ignore empty elements (unset variable, stray commas)
                continue
            if ':' in elem:
                key, val = elem.split(':', 1)
                tags[key] = val
            else:
                tags[elem] = None

    command_idx = index_command(command, tags)

    if url.schema == 'mongodb':

        [dbhost, port, dbname, _, _, _, _] = ru.split_dburl(url)

        db_client = pymongo.MongoClient(host=dbhost, port=port)
        database = db_client[dbname]
        collection = database['profiles']

        # FIXME: eval partial tags
        query = {
            'type': 'synapse_profile',
            'tags': tags,
            'command_idx': command_idx,
        }
        if mode:
            query['mode'] = {'$in': mode}  # FIXME: check

        # materialize the cursor first and test the list, instead of relying
        # on the separate Cursor.count() round trip
        ret = list(collection.find(query))

        if not ret:
            raise RuntimeError("Could not get profile for %s at %s/profiles" %
                               (command, url))

    elif url.schema == 'file':

        path = url.path

        if not os.path.isdir(path):
            raise ValueError("dburl (%s) must point to an existing dir" % url)

        # profile file names are built from the first word of the command
        # index, followed by the tag values (or the tag name for tags without
        # value) in sorted tag order
        name = command_idx.split()[0]
        for tag in sorted(tags.keys()):
            if tags[tag] is not None:
                name += "_%s" % tags[tag]
            else:
                name += "_%s" % tag

        base = "%s/synapse_profile_%s_*.json" % (path, name)
        ret = list()
        for fname in glob.glob(base):

            doc = ru.read_json_str(fname)
            docmode = doc['mode'][0:3]  # modes are compared by 3-char prefix

            doc['fname'] = fname

            if doc['command'] != command:
                print("skip command %s" % command)
                print("   ! command %s" % doc['command'])
                continue

            if mode and docmode not in mode:
                print("skip: mode %s not in %s" % (docmode, mode))
                continue

            ret.append(doc)

        if not ret:
            raise LookupError("No matching profile at %s" % base)

    else:
        # without this branch `ret` would be unbound for any other schema
        raise ValueError("dburl (%s) has unsupported schema '%s'" %
                         (url, url.schema))

    return ret
Ejemplo n.º 21
0
""")

    import glob
    configs = glob.glob("%s/../../src/radical/pilot/configs/resource_*.json" % script_dir)
    for config in configs:

        if not config.endswith(".json"):
            continue # skip all non-python files

        if "/resource_aliases" in config:
            continue # skip alias files

        print " * %s" % config

        try: 
             json_data = ru.read_json_str(config)
        except Exception, ex:
             print "    * JSON PARSING ERROR: %s" % str(ex)
             continue

        config = config.split('/')[-1]

        resources_rst.write("{0}\n".format(config[:-5].upper()))
        resources_rst.write("{0}\n\n".format("="*len(config[:-5])))

        for host_key, resource_config in json_data.iteritems():
            resource_key = "%s.%s" % (config[:-5], host_key)
            print "   * %s" % resource_key
            try:
                default_queue = resource_config["default_queue"]
            except Exception, ex:
# 
# {
#     "scheduler"    : "rp.SCHED_BACKFILLING",
#     "resources"    : ["india.furturegrid.org", "sierra.futuregrid.org"],
#     "resource_cfg" :
#     {
#         "*.futuregrid.org" :
#         {
#             "username"      : "merzky"
#         }
#     }
# }
# location of the per-user config file: $HOME/.my_app.cfg, falling back to
# /tmp/.my_app.cfg if HOME is not set
USER_CONFIG_PATH = os.environ.get ('HOME', '/tmp') + '/.my_app.cfg' 

# load the user config from that file
user_config = ru.read_json_str (USER_CONFIG_PATH)


# merge the user config into the app config, so that the user config keys are
# applied where appropriate.
# NOTE(review): presumably `wildcards=True` lets keys such as
#               '*.futuregrid.org' apply to several app config entries --
#               confirm against ru.dict_merge.
ru.dict_merge (app_config, user_config, policy='overwrite', wildcards=True)


# let's see what we got
pprint.pprint (app_config)


# this should result in :
#
# {
#     'log_level'   : 0,