def run (self) :
        """
        daemon workload

        The daemon will loop forever, sleeping self.idle_timeout seconds after
        each iteration.  Every iteration will query the Bundle Agents for status
        updates, and will push those to the given MongoDB URL.
        """

        try :

            if not self.config_file :
                raise RuntimeError ('no bundle config file -- call run() via start_daemon()!')

            self.load_cluster_credentials(self.config_file)


            while True :

                # FIXME: make configurable via config file
                mongo, db, dbname, cname, pname = ru.mongodb_connect (self.mongodb_url)

                coll_config    = db['config'   ]
                coll_workload  = db['workload' ]
                coll_bandwidth = db['bandwidth']

                ret = self.get_data ()

              # with open ('/tmp/l', 'w+') as log:
              #     import pprint
              #     pprint.pprint (ret)
              #     log.write ("\n\n%s\n\n" % pprint.pformat (ret))

                for cluster_ip in ret['cluster_list'] :

                    cluster_id = ip2id (cluster_ip)
                
                    config     = ret['cluster_config'   ][cluster_id]
                    workload   = ret['cluster_workload' ][cluster_id]
                    bandwidth  = ret['cluster_bandwidth'][cluster_id]

                    timestamp  = time.time ()

                    if config    : config    ['timestamp'] = timestamp
                    if workload  : workload  ['timestamp'] = timestamp
                    if bandwidth : bandwidth ['timestamp'] = timestamp

                    if config    : config    ['_id'] = cluster_id
                    if workload  : workload  ['_id'] = cluster_id
                    if bandwidth : bandwidth ['_id'] = cluster_id

                    if config    : coll_config   .update ({'_id': cluster_id}, config   , upsert=True)
                    if workload  : coll_workload .update ({'_id': cluster_id}, workload , upsert=True)
                    if bandwidth : coll_bandwidth.update ({'_id': cluster_id}, bandwidth, upsert=True)

                time.sleep (self.idle_timeout)

        except Exception as e :
            # FIXME: need a logfile from daemon base class
            raise
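
All of these examples share one entry point: ru.mongodb_connect() returns a five-element tuple -- the client handle, the database handle, and what appear to be the database/collection/document names parsed from the URL path. A minimal sketch of that pattern (the local URL and the 'bundle' database are assumptions for illustration):

import radical.utils as ru

# connect and unpack -- the name elements mirror the path of the URL
mongo, db, dbname, cname, pname = ru.mongodb_connect('mongodb://localhost:27017/bundle')

print 'database   : %s' % dbname
print 'collections: %s' % db.collection_names()

mongo.close()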
def dump (url, mode) :
    """
    Connect to mongodb at the given location, and traverse the databases
    """

    mongo, db, dbname, cname, pname = ru.mongodb_connect (url, _DEFAULT_DBURL)

    print dbname
 
    if  dbname : dbnames = [dbname]
    else       : dbnames = mongo.database_names ()

    for name in dbnames :

        if  mode == 'list' and not dbname :
            print " +-- db   %s" % name

        elif  mode == 'remove' :
            
            if (not dbname) or (name == dbname) :
                try :
                    mongo.drop_database (name)
                    print "  removed database %s" % name
                except :
                    pass # ignore system databases

        else :
            handle_db (mongo, mode, name, cname, pname)

    mongo.disconnect ()
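
A possible invocation of dump(), assuming a reachable MongoDB (the URLs are placeholders; _DEFAULT_DBURL is the module-level fallback passed to ru.mongodb_connect above):

# list all databases on the host (an empty URL path addresses all of them)
dump('mongodb://localhost:27017/', 'list')

# drop a single database (the URL path selects it)
dump('mongodb://localhost:27017/bundle', 'remove')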
Example 3
    def __init__(self, db_url, db_name="AIMES_bundle"):
        url = ru.Url(db_url)
        if db_name:
            url.path = db_name

        mongo, db, dbname, _, _ = ru.mongodb_connect (url)

        self._client = mongo
        self._db     = db
        self._dbname = dbname
        self._dburl  = str(url)
        if url.username and url.password:
            self._dbauth = "{}:{}".format(url.username, url.password)
        else:
            self._dbauth = None

        self._session_id = None

        # shortcuts to collections
        #    db.session
        self._s   = None
        #    db.session.resource
        self._r = None
        #    db.session.resource.config
        self._rc   = None
        #    db.session.resource.workload
        self._rw   = None
        #    db.session.resource.bandwidth
        self._bw  = None
        #    db.session.bundle_manager
        self._bm  = None
Example 4
    def __init__(self, db_url, db_name="radicalpilot"):
        """ Le constructeur. Should not be called directrly, but rather
            via the static methods new() or reconnect().
        """

        url = ru.Url (db_url)

        if  db_name :
            url.path = db_name

        mongo, db, dbname, pname, cname = ru.mongodb_connect (url)

        self._client = mongo
        self._db     = db
        self._dburl  = str(url)
        self._dbname = dbname
        if url.username and url.password:
            self._dbauth = "%s:%s" % (url.username, url.password)
        else:
            self._dbauth = None

        self._session_id = None

        self._s  = None

        self._w  = None
        self._um = None

        self._p  = None
        self._pm = None
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False, session=None,
        log=None):
    '''
    returns file name
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    else:
        # a log handle was passed in -- we still need a reporter
        rep = ru.Reporter('radical.pilot.utils')

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    try:
        os.makedirs(os.path.dirname(dst))  # ensure the target dir exists
    except OSError:
        pass # dir exists

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        log.info("session already in %s", dst)

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            raise ValueError('RADICAL_PILOT_DBURL is not set')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)

        log.info("session written to %s", dst)

        mongo.close()

    rep.ok("+ %s (json)\n" % sid)
    return dst
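
A usage sketch for this fetch_json() variant; the session id and paths are placeholders, and RADICAL_PILOT_DBURL must point at the database holding the session:

import os

os.environ['RADICAL_PILOT_DBURL'] = 'mongodb://localhost:27017/rp'  # assumed URL

# writes the session documents to /tmp/sessions/<sid>.json
dst = fetch_json('rp.session.example.0000', tgt='/tmp/sessions')
print 'session dump: %s' % dst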
Example 6
    def __init__(self, sid, name, dburl):
        """ Creates a new session
            A session is a distinct collection with three sub-collections
            in MongoDB:

            radical.pilot.<sid>    | Base collection. Holds some metadata.   | self._s
            radical.pilot.<sid>.cu | Collection holding all compute units.   | self._w
            radical.pilot.<sid>.um | Collection holding all unit managers.   | self._um
            radical.pilot.<sid>.p  | Collection holding all pilots.          | self._p
            radical.pilot.<sid>.pm | Collection holding all pilot managers.  | self._pm

            All collections are created with a new session. Since MongoDB
            uses lazy-create, they only appear in the database after the
            first insert. That's ok.
        """

        # mongodb_connect wants a string at the moment
        mongo, db, _, _, _ = ru.mongodb_connect(str(dburl))

        if not mongo or not db:
            raise RuntimeError("Could not connect to database at %s" % dburl)

        self._client     = mongo
        self._db         = db
        self._dburl      = ru.Url(dburl)
        self._session_id = sid
        self._created    = time.time()
        self._connected  = self._created
        self._closed     = None

        # make sure session doesn't exist already
        if self._db[sid].count() != 0:
            raise RuntimeError("Session '%s' already exists." % sid)

        # create the db entry
        self._s = self._db["%s" % sid]
        self._s.insert({"_id"       : sid,
                        "name"      : name,
                        "created"   : self._created,
                        "connected" : self._created})

        # Create the collection shortcut:
        self._w  = self._db["%s.cu" % sid]
        self._um = self._db["%s.um" % sid] 

        self._p  = self._db["%s.p"  % sid]
        self._pm = self._db["%s.pm" % sid] 
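
Given the collection layout documented above, a reconnecting client can address the sub-collections directly. A sketch, assuming an existing session id and database (both placeholders):

import radical.utils as ru

sid = 'rp.session.example.0000'   # hypothetical session id
mongo, db, _, _, _ = ru.mongodb_connect('mongodb://localhost:27017/radicalpilot')

session_doc = db[sid].find_one({'_id': sid})   # base collection
n_units     = db['%s.cu' % sid].count()        # compute units
n_pilots    = db['%s.p'  % sid].count()        # pilots

print 'session %s (created %s): %d units on %d pilots' \
    % (sid, session_doc['created'], n_units, n_pilots)

mongo.close()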
Example 7
def fetch_json(sid, dburl=None, tgt=None, skip_existing=False):

    '''
    returns file name
    '''

    if not tgt:
        tgt = '.'

    if tgt.startswith('/'):
        # Assume an absolute path
        dst = os.path.join(tgt, '%s.json' % sid)
    else:
        # Assume a relative path
        dst = os.path.join(os.getcwd(), tgt, '%s.json' % sid)

    if skip_existing and os.path.isfile(dst) \
            and os.stat(dst).st_size > 0:

        print "session already in %s" % dst

    else:

        if not dburl:
            dburl = os.environ.get('RADICAL_PILOT_DBURL')

        if not dburl:
            from radical.pilot.session import default_dburl
            logger.report.warn('using default dburl: %s' % default_dburl)
            dburl = default_dburl

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)

        json_docs = get_session_docs(db, sid)
        ru.write_json(json_docs, dst)

        print "session written to %s" % dst

        mongo.close()

    return dst
Example 8
    def initialize_child(self):

        self._session_id    = self._cfg['session_id']
        self._mongodb_url   = self._cfg['mongodb_url']
        self._pilot_id      = self._cfg['pilot_id']

        _, db, _, _, _      = ru.mongodb_connect(self._mongodb_url)
        self._mongo_db      = db
        self._cinfo         = dict()            # collection cache
        self._lock          = threading.RLock() # protect _cinfo
        self._state_cache   = dict()            # used to preserve state ordering

        self.declare_subscriber('state', 'agent_state_pubsub', self.state_cb)
        self.declare_idle_cb(self.idle_cb, self._cfg.get('bulk_collection_time'))

        # all components use the command channel for control messages
        self.declare_publisher ('command', rpc.AGENT_COMMAND_PUBSUB)

        # communicate successful startup
        self.publish('command', {'cmd' : 'alive',
                                 'arg' : self.cname})
Example 9
    def query_db (self) :

        mongo, db, dbname, cname, pname = ru.mongodb_connect (self.mongodb_url)

        self._priv = dict()
        self._priv['cluster_list']      = list()
        self._priv['cluster_config']    = dict()
        self._priv['cluster_workload']  = dict()
        self._priv['cluster_bandwidth'] = dict()


        for doc in list(db['config'].find ()):
            self._priv['cluster_list'].append (doc['_id'])
            self._priv['cluster_config'][doc['_id']] = doc

        for doc in list(db['workload'].find ()):
            self._priv['cluster_workload'][doc['_id']] = doc

        for doc in list(db['bandwidth'].find ()):
            self._priv['cluster_bandwidth'][doc['_id']] = doc


        # we have a dictionary of Resource instances, indexed by resource name
        self.resources = dict()
        for resource_name in self._priv['cluster_list']:

            config     = self._priv['cluster_config'   ].get (resource_name, dict())
            workload   = self._priv['cluster_workload' ].get (resource_name, dict())
            bandwidths = self._priv['cluster_bandwidth'].get (resource_name, dict())

            # import pprint
            # pprint.pprint(bandwidths)

            self.resources[resource_name] = Resource(resource_name, config, workload, bandwidths)


        # and a list of Queue instances, for all queues of all resources
        self.queues = list()
        for resource in self.resources:
            self.queues += self.resources[resource].queues.values()
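
A consumption sketch for the structures query_db() builds, assuming 'bundle' is an instance of the surrounding class (Resource.queues is a dict, as used above):

bundle.query_db()

for name, resource in bundle.resources.items():
    print '%-20s : %d queues' % (name, len(resource.queues))

print 'total queues: %d' % len(bundle.queues)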
Example 10
    def initialize_child(self):

        self._session_id = self._cfg['session_id']
        self._dburl      = self._cfg['dburl']
        self._owner      = self._cfg['owner']

        # TODO: get db handle from a connected session
        _, db, _, _, _   = ru.mongodb_connect(self._dburl)
        self._mongo_db   = db
        self._coll       = self._mongo_db[self._session_id]
        self._bulk       = self._coll.initialize_ordered_bulk_op()
        self._last       = time.time()        # time of last bulk push
        self._uids       = list()             # list of collected uids
        self._lock       = threading.RLock()  # protect _bulk

        self._bct        = self._cfg.get('bulk_collection_time',
                                          DEFAULT_BULK_COLLECTION_TIME)
        self._bcs        = self._cfg.get('bulk_collection_size',
                                          DEFAULT_BULK_COLLECTION_SIZE)

        self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)
        self.register_timed_cb(self._idle_cb, timer=self._bct)
Example 11
    def initialize_child(self):

        self._session_id    = self._cfg['session_id']
        self._mongodb_url   = self._cfg['mongodb_url']

        self.declare_idle_cb(self.idle_cb, self._cfg.get('heartbeat_interval'))

        # all components use the command channel for control messages
        self.declare_publisher ('command', rpc.AGENT_COMMAND_PUBSUB)

        self._pilot_id      = self._cfg['pilot_id']
        self._session_id    = self._cfg['session_id']
        self._runtime       = self._cfg['runtime']
        self._starttime     = time.time()

        # set up db connection
        _, mongo_db, _, _, _  = ru.mongodb_connect(self._cfg['mongodb_url'])

        self._p  = mongo_db["%s.p"  % self._session_id]
        self._cu = mongo_db["%s.cu" % self._session_id]

        # communicate successful startup
        self.publish('command', {'cmd' : 'alive',
                                 'arg' : self.cname})
Example 12
def fetch_logfiles (sid, dburl=None, src=None, tgt=None, access=None, 
        session=None, skip_existing=False, fetch_client=False, log=None):
    '''
    sid: session for which all logfiles are fetched
    src: dir to look for client session logfiles
    tgt: dir to store the logfile in

    returns list of file names
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    else:
        # a log handle was passed in -- we still need a reporter
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError ('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()
            
    if not tgt:
        tgt = os.getcwd()
            
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they don't get interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = rs.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            pass
        else:
            log_file = rs.fs.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", num_pilots)


    for pilot in pilots:

        try:
            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow use of a different access schema than was used for the run.
                # Useful if you ran from the headnode, but would like to retrieve
                # the logfiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

            sandbox  = rs.fs.Directory (sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them all in one (SAGA) go!
            LOGFILES_TARBALL  = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(LOGFILES_TARBALL) and \
                    sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("Skip fetching of '%s/%s' to '%s'.", 
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("Fetching '%s%s' to '%s'.", 
                                sandbox_url, LOGFILES_TARBALL, tgt_url)
                        log_file = rs.fs.File("%s%s" % (sandbox_url, LOGFILES_TARBALL), session=session)
                        log_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        log_file.close()

                        tarball_available = True
                else:
                    log.warn("logiles tarball doesnt exists")

            except rs.DoesNotExist:
                log.warn("logfiles tarball doesnt exists")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                try:
                    tarball = tarfile.open(ftgt.path)
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    logfiles = glob.glob("%s/%s/*.log" % (tgt_url.path, pilot['uid']))
                    log.info("tarball %s extracted to '%s/%s/'.", 
                            ftgt.path, tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    os.unlink(ftgt.path)

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                continue

            # If we don't have a tarball (for whatever reason), fetch individual logfiles
            logfiles = sandbox.list('*.log')

            for logfile in logfiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:

                    continue

                log_file = rs.fs.File("%s%s" % (sandbox_url, logfile), session=session)
                log_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception as e:
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
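
A hedged invocation sketch for fetch_logfiles(); the session id and target directory are placeholders, and RADICAL_PILOT_DBURL selects the database:

logfiles = fetch_logfiles('rp.session.example.0000',
                          tgt='/tmp/logs',
                          fetch_client=True,
                          skip_existing=True)

for f in logfiles:
    print f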
Example 13
def fetch_logfiles(sid,
                   dburl=None,
                   src=None,
                   tgt=None,
                   access=None,
                   session=None,
                   skip_existing=False,
                   fetch_client=False,
                   log=None):
    '''
    sid: session for which all logfiles are fetched
    src: dir to look for client session logfiles
    tgt: dir to store the logfile in

    returns list of file names
    '''

    if not log and session:
        log = session._log
        rep = session._rep
    elif not log:
        log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')
    else:
        # a log handle was passed in -- we still need a reporter
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they don't get interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    if fetch_client:
        # first fetch session logfile
        client_logfile = "%s/%s.log" % (src, sid)

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:
            pass
        else:
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.info("Session: %s", sid)
    log.info("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            sandbox_url = saga.Url(pilot['pilot_sandbox'])

            if access:
                # Allow use of a different access schema than was used for the run.
                # Useful if you ran from the headnode, but would like to retrieve
                # the logfiles to your desktop (Hello Titan).
                access_url = saga.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host = access_url.host

            sandbox = saga.filesystem.Directory(sandbox_url, session=session)

            # Try to fetch a tarball of logfiles, so that we can get them all in one (SAGA) go!
            LOGFILES_TARBALL = '%s.log.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(LOGFILES_TARBALL) and \
                    sandbox.get_size(LOGFILES_TARBALL):

                    log.info("logfiles tarball exists")
                    ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("Skip fetching of '%s/%s' to '%s'.",
                                 sandbox_url, LOGFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("Fetching '%s%s' to '%s'.", sandbox_url,
                                 LOGFILES_TARBALL, tgt_url)
                        log_file = saga.filesystem.File(
                            "%s%s" % (sandbox_url, LOGFILES_TARBALL),
                            session=session)
                        log_file.copy(ftgt,
                                      flags=saga.filesystem.CREATE_PARENTS)
                        log_file.close()

                        tarball_available = True
                else:
                    log.warn("logiles tarball doesnt exists")

            except saga.DoesNotExist:
                log.warn("logfiles tarball doesnt exists")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.debug("Extract tarball %s to %s", ftgt.path, tgt_url.path)

                try:
                    tarball = tarfile.open(ftgt.path)
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    logfiles = glob.glob("%s/%s/*.log" %
                                         (tgt_url.path, pilot['uid']))
                    log.info("tarball %s extracted to '%s/%s/'.", ftgt.path,
                             tgt_url.path, pilot['uid'])
                    ret.extend(logfiles)
                    os.unlink(ftgt.path)

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

                # If extract succeeded, no need to fetch individual logfiles
                rep.ok("+ %s (logfiles)\n" % pilot['uid'])
                continue

            # If we don't have a tarball (for whatever reason), fetch individual logfiles
            logfiles = sandbox.list('*.log')

            for logfile in logfiles:

                ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], logfile))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:

                    continue

                log_file = saga.filesystem.File("%s%s" %
                                                (sandbox_url, logfile),
                                                session=session)
                log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                log_file.close()

            rep.ok("+ %s (logfiles)\n" % pilot['uid'])

        except Exception as e:
            rep.error("- %s (logfiles)\n" % pilot['uid'])

    return ret
Example 14
    def initialize_child(self):
        """
        Read the configuration file, setup logging and mongodb connection.
        This prepares the stage for the component setup (self._setup()).
        """

        # keep track of objects we need to stop in the finally clause
        self._sub_agents = dict()
        self._components = dict()
        self._workers    = dict()

        # sanity check on config settings
        if not 'cores'               in self._cfg: raise ValueError("Missing number of cores")
        if not 'debug'               in self._cfg: raise ValueError("Missing DEBUG level")
        if not 'lrms'                in self._cfg: raise ValueError("Missing LRMS")
        if not 'mongodb_url'         in self._cfg: raise ValueError("Missing MongoDB URL")
        if not 'pilot_id'            in self._cfg: raise ValueError("Missing pilot id")
        if not 'runtime'             in self._cfg: raise ValueError("Missing or zero agent runtime")
        if not 'scheduler'           in self._cfg: raise ValueError("Missing agent scheduler")
        if not 'session_id'          in self._cfg: raise ValueError("Missing session id")
        if not 'spawner'             in self._cfg: raise ValueError("Missing agent spawner")
        if not 'task_launch_method'  in self._cfg: raise ValueError("Missing unit launch method")
        if not 'agent_layout'        in self._cfg: raise ValueError("Missing agent layout")

        self._pilot_id   = self._cfg['pilot_id']
        self._session_id = self._cfg['session_id']
        self._runtime    = self._cfg['runtime']
        self._sub_cfg    = self._cfg['agent_layout'][self.agent_name]
        self._pull_units = self._sub_cfg.get('pull_units', False)

        # this better be on a shared FS!
        self._cfg['workdir'] = os.getcwd()

        # another sanity check
        if self.agent_name == 'agent_0':
            if self._sub_cfg.get('target', 'local') != 'local':
                raise ValueError("agent_0 must run on target 'local'")

        # configure the agent logger
        self._log.setLevel(self._cfg['debug'])

        # set up db connection -- only for the master agent and for the agent
        # which pulls units (which might be the same)
        if self.agent_name == 'agent_0' or self._pull_units:
            self._log.debug('connecting to mongodb at %s for unit pull',
                            self._cfg['mongodb_url'])
            _, mongo_db, _, _, _  = ru.mongodb_connect(self._cfg['mongodb_url'])

            self._p  = mongo_db["%s.p"  % self._session_id]
            self._cu = mongo_db["%s.cu" % self._session_id]
            self._log.debug('connected to mongodb')

        # first order of business: set the start time and state of the pilot
        # Only the master agent performs this action
        if self.agent_name == 'agent_0':
            now = time.time()
            ret = self._p.update(
                {"_id": self._pilot_id},
                {"$set" : {"state"        : rps.ACTIVE,
                           "started"      : now},
                 "$push": {"statehistory" : {"state"    : rps.ACTIVE,
                                             "timestamp": now}}
                })
            # TODO: Check for return value, update should be true!
            self._log.info("Database updated: %s", ret)

        # make sure we collect commands, specifically to implement the startup
        # barrier on bootstrap_4
        self.declare_publisher ('command', rpc.AGENT_COMMAND_PUBSUB)
        self.declare_subscriber('command', rpc.AGENT_COMMAND_PUBSUB, self.barrier_cb)

        # Now instantiate all communication and notification channels, and all
        # components and workers.  It will then feed a set of units to the
        # lead-in queue (staging_input).  A state notification callback will
        # then register all units which reached a final state (DONE).  Once all
        # units are accounted for, it will tear down all created objects.

        # we pick the layout according to our role (name)
        # NOTE: we don't do sanity checks on the agent layout (too lazy) -- but
        #       we would hiccup badly over ill-formatted or incomplete layouts...
        if not self.agent_name in self._cfg['agent_layout']:
            raise RuntimeError("no agent layout section for %s" % self.agent_name)

        try:
            self.start_sub_agents()
            self.start_components()

            # before we declare bootstrapping a success, we wait for all
            # components, workers and sub_agents to complete startup.  For that,
            # each sub-agent will wait for ALIVE messages on the COMMAND pubsub
            # for all entities it spawned.  Only when all are alive will we
            # continue here.
            self.alive_barrier()

        except Exception as e:
            self._log.exception("Agent setup error: %s" % e)
            raise

        self._prof.prof('Agent setup done', logger=self._log.debug, uid=self._pilot_id)

        # also watch all components (every 10 seconds)
        self.declare_idle_cb(self.watcher_cb, 10.0)

        # once bootstrap_4 is done, we signal success to the parent agent
        # -- if we have any parent...
        if self.agent_name != 'agent_0':
            self.publish('command', {'cmd' : 'alive',
                                     'arg' : self.agent_name})

        # the pulling agent registers the staging_input_queue as this is what we want to push to
        # FIXME: do a sanity check on the config that only one agent pulls, as
        #        this is a non-atomic operation at this point
        self._log.debug('agent will pull units: %s' % bool(self._pull_units))
        if self._pull_units:

            self.declare_output(rps.AGENT_STAGING_INPUT_PENDING, rpc.AGENT_STAGING_INPUT_QUEUE)
            self.declare_publisher('state', rpc.AGENT_STATE_PUBSUB)

            # register idle callback, to pull for units -- which is the only action
            # we have to perform, really
            self.declare_idle_cb(self.idle_cb, self._cfg['db_poll_sleeptime'])
Example 15
def get_session_frames (sids, db=None, cachedir=None) :

    # use like this: 
    #
    # session_frame, pilot_frame, unit_frame = rpu.get_session_frames (session, db, cachedir)
    # pandas.set_option('display.width', 1000)
    # print session_frame
    # print pilot_frame
    # print unit_frame
    #
    # u_min = unit_frame.ix[unit_frame['started'].idxmin()]['started']
    # u_max = unit_frame.ix[unit_frame['finished'].idxmax()]['finished']
    # print u_min
    # print u_max
    # print u_max - u_min

    mongo = None

    if not db:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')
        if not dburl:
            raise RuntimeError ('Please set RADICAL_PILOT_DBURL')

        mongo, db, _, _, _ = ru.mongodb_connect(dburl)


    if not isinstance (sids, list) :
        sids = [sids]

    session_dicts = list()
    pilot_dicts   = list()
    unit_dicts    = list()

    for sid in sids :

        docs = get_session_docs (db, sid, cachedir=cachedir)

        session       = docs['session']
        session_start = session['created']
        session_dict  = {
            'sid'       : sid,
            'started'   : session['created'],
            'finished'  : None, 
            'n_pilots'  : len(docs['pilot']),
            'n_units'   : 0
            }

        last_pilot_event = 0
        for pilot in docs['pilot'] :

            pid         = pilot['_id']
            description = pilot.get ('description', dict())
            started     = pilot.get ('started')
            finished    = pilot.get ('finished')
            
            cores = 0

            if pilot['nodes'] and pilot['cores_per_node']:
                cores = len(pilot['nodes']) * pilot['cores_per_node']
            else:
                cores = description.get('cores')

            if started  : started  -= session_start
            if finished : finished -= session_start

            pilot_dict = {
                'sid'          : sid,
                'pid'          : pid, 
                'n_units'      : len(pilot.get ('unit_ids', list())), 
                'started'      : started,
                'finished'     : finished,
                'resource'     : description.get ('resource'),
                'cores'        : cores,
                'runtime'      : description.get ('runtime'),
                NEW            : None, 
                PENDING_LAUNCH : None, 
                LAUNCHING      : None, 
                PENDING_ACTIVE : None, 
                ACTIVE         : None, 
                DONE           : None, 
                FAILED         : None, 
                CANCELED       : None
            }

            for entry in pilot.get('statehistory', list()):
                state = entry['state']
                timer = entry['timestamp'] - session_start
                pilot_dict[state] = timer
                last_pilot_event  = max(last_pilot_event, timer)

            if not pilot_dict[NEW]:
                if pilot_dict[PENDING_LAUNCH]:
                    pilot_dict[NEW] = pilot_dict[PENDING_LAUNCH]
                else:
                    pilot_dict[NEW] = pilot_dict[LAUNCHING]

            pilot_dicts.append (pilot_dict)


        for unit in docs['unit']:

            uid         = unit['_id']
            started     = unit.get ('started')
            finished    = unit.get ('finished')
            description = unit.get ('description', dict())

            if started  : started  -= session_start
            if finished : finished -= session_start

            session_dict['n_units'] += 1

            unit_dict = {
                'sid'                  : sid, 
                'pid'                  : unit.get('pilot'), 
                'uid'                  : uid, 
                'started'              : started,
                'finished'             : finished,
                'cores'                : description.get ('cores'),
                'slots'                : unit.get ('slots'),
                NEW                    : None, 
                UNSCHEDULED            : None, 
                PENDING_INPUT_STAGING  : None, 
                STAGING_INPUT          : None, 
                EXECUTING_PENDING      : None,
                SCHEDULING             : None, 
                ALLOCATING             : None, 
                EXECUTING              : None, 
                PENDING_OUTPUT_STAGING : None, 
                STAGING_OUTPUT         : None, 
                DONE                   : None, 
                FAILED                 : None, 
                CANCELED               : None
            }

            for entry in unit.get('statehistory', list()):
                state = entry['state']
                timer = entry['timestamp'] - session_start
                unit_dict[state] = timer

            # FIXME: there is more state messup afloat: some states are missing,
            # even though we know they have happened.  For one, we see data
            # being staged w/o having a record of InputStaging states.  Or we
            # find callback history entries for states which are not in the
            # history...
            #
            # We try to clean up to some extent.  The policy is like this, for
            # any [pending_state, state] pair:
            #
            # - if both are in the hist: great
            # - if one is in the hist, and the other in the cb hist, use like
            #   that, but ensure that pending_state <= state
            # - if both are in cb_hist, use them, apply same ordering assert.
            #   Use median if ordering is wrong
            # - if only one is in cb_hist, use the same value for the other one
            # - if neither is anywhere, leave unset
            rec_hist = dict()
            cb_hist  = dict()

            for e in unit.get('statehistory', list()):
                state = e['state']
                timer = e['timestamp'] - session_start
                if state not in rec_hist:
                    rec_hist[state] = list()
                rec_hist[state].append(timer)

            for e in unit.get('callbackhistory', list()):
                state = e['state']
                timer = e['timestamp'] - session_start
                if state not in cb_hist:
                    cb_hist[state] = list()
                cb_hist[state].append(timer)

            statepairs = {STAGING_INPUT  : PENDING_INPUT_STAGING ,
                          STAGING_OUTPUT : PENDING_OUTPUT_STAGING}

            primary_states = [NEW                   ,
                              UNSCHEDULED           ,
                              STAGING_INPUT         ,
                              EXECUTING_PENDING     ,
                              SCHEDULING            ,
                              ALLOCATING            ,
                              EXECUTING             ,
                              STAGING_OUTPUT        ,
                              DONE                  ,
                              CANCELED              ,
                              FAILED                ]

            for state in primary_states:

                pend    = None
                t_state = None
                t_pend  = None

                ts_rec  = rec_hist.get (state) #         state time stamp from state hist
                ts_cb   = cb_hist.get  (state) #         state time stamp from cb    hist
                tp_rec  = None                 # pending state time stamp from state hist
                tp_cb   = None                 # pending state time stamp from cb    hist

                if  state in statepairs:
                    pend   = statepairs[state]
                    tp_rec = rec_hist.get (pend)
                    tp_cb  = cb_hist.get  (pend)

                # try to find a candidate for state timestamp
                if   ts_rec : t_state = ts_rec[0]
                elif ts_cb  : t_state = ts_cb [0]
                elif tp_rec : t_state = tp_rec[0]
                elif tp_cb  : t_state = tp_cb [0]

                # try to find a candidate for pending timestamp
                if   tp_rec : t_pend  = tp_rec[0]
                elif tp_cb  : t_pend  = tp_cb [0]

                # if there is no t_pend, check if there are two state times on
                # record (in the state hist), and if so, reorder
                if pend :
                    if t_state and not t_pend:
                        if ts_rec and len(ts_rec) == 2:
                            t_pend  = min (ts_rec)
                            t_state = max (ts_rec)
                        else:
                            t_pend  = t_state

                # make sure that any pending time comes before state time
                if pend:
                    if t_pend > t_state:
                      # print "%s : %s" % (uid, state)
                        t_med   = (t_pend + t_state) / 2
                        t_pend  = t_med
                        t_state = t_med

                # record the times for the data frame
                unit_dict[state] = t_state

                if pend :
                    unit_dict[pend] = t_pend


            if unit_dict[UNSCHEDULED] and unit_dict[SCHEDULING]:
                unit_dict[UNSCHEDULED] = min(unit_dict[UNSCHEDULED], unit_dict[SCHEDULING])

            if not unit_dict[NEW]:
                if unit_dict[UNSCHEDULED]:
                    unit_dict[NEW] = unit_dict[UNSCHEDULED]
                if unit_dict[SCHEDULING]:
                    unit_dict[NEW] = unit_dict[SCHEDULING]


            unit_dicts.append (unit_dict)
        
        session_dict['finished'] = last_pilot_event
        session_dicts.append (session_dict)

    import pandas 
    session_frame = pandas.DataFrame (session_dicts)
    pilot_frame   = pandas.DataFrame (pilot_dicts)
    unit_frame    = pandas.DataFrame (unit_dicts)

    if mongo:
        mongo.close()

    return session_frame, pilot_frame, unit_frame
Example 16
def fetch_logfiles (sid, dburl=None, client=None, tgt=None, access=None, 
        session=None, skip_existing=False):
    '''
    sid: session for which all logfiles are fetched
    client: dir to look for client session logfiles
    tgt: dir to store the logfile in

    returns list of file names
    '''


    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        from radical.pilot.session import default_dburl
        logger.report.warn('using default dburl: %s' % default_dburl)
        dburl = default_dburl

    if not client:
        client = os.getcwd()
            
    if not tgt:
        tgt = os.getcwd()
            
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url(tgt)

    # Turn URLs without schema://host into file://localhost,
    # so that they don't get interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session logfile
    # FIXME: should we record pwd or logfile location in db session?  Or create
    #        a sandbox-like dir for storing logfiles and logs?
    client_logfile = "%s/%s.log" % (client, sid)

    ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_logfile)))
    ret.append("%s" % ftgt.path)

    if skip_existing and os.path.isfile(ftgt.path) \
            and os.stat(ftgt.path).st_size > 0:

        logger.report.info("\t- %s\n" % client_logfile.split('/')[-1])

    else:

        if not os.path.isfile(client_logfile):
            print 'skipping client logfile: %s does not exist' % client_logfile

        else:
            logger.report.info("\t+ %s\n" % client_logfile.split('/')[-1])
            log_file = saga.filesystem.File(client_logfile, session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
 #  print "Session: %s" % sid
 #  print "Number of pilots in session: %d" % num_pilots

    for pilot in pilots:

      # print "Processing pilot '%s'" % pilot['_id']

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow use of a different access scheme than was used for the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the logfiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

          # print "Overriding remote sandbox: %s" % sandbox_url

        sandbox  = saga.filesystem.Directory (sandbox_url, session=session)

        # Try to fetch a tarball of logfiles, so that we can get them all in one (SAGA) go!
        LOGFILES_TARBALL = '%s.log.tgz' % pilot['_id']
        tarball_available = False
        try:
            if sandbox.is_file(LOGFILES_TARBALL):
                print "Logfiles tarball exists!"

                ftgt = saga.Url('%s/%s' % (tgt_url, LOGFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:

                    print "Skipping fetching of '%s/%s' to '%s'." % (sandbox_url, LOGFILES_TARBALL, tgt_url)
                    tarball_available = True
                else:

                    print "Fetching '%s%s' to '%s'." % (sandbox_url, LOGFILES_TARBALL, tgt_url)
                    log_file = saga.filesystem.File("%s%s" % (sandbox_url, LOGFILES_TARBALL), session=session)
                    log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    log_file.close()

                    tarball_available = True
            else:
                print "Logfiles tarball doesnt exists!"

        except saga.DoesNotExist:
            print "exception(TODO): logfiles tarball doesnt exists!"

        try:
            os.mkdir("%s/%s" % (tgt_url.path, pilot['_id']))
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            print "Extracting tarball %s into '%s'." % (ftgt.path, tgt_url.path)
            tarball = tarfile.open(ftgt.path)
            tarball.extractall("%s/%s" % (tgt_url.path, pilot['_id']))

            logfiles = glob.glob("%s/%s/*.log" % (tgt_url.path, pilot['_id']))
            print "Tarball %s extracted to '%s/%s/'." % (ftgt.path, tgt_url.path, pilot['_id'])
            ret.extend(logfiles)
            os.unlink(ftgt.path)

            # If extract succeeded, no need to fetch individual logfiles
            continue

        # If we don't have a tarball (for whatever reason), fetch individual logfiles
        logfiles = sandbox.list('*.log')

        for log in logfiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], log))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                             and os.stat(ftgt.path).st_size > 0:

                logger.report.info("\t- %s\n" % str(log).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(log).split('/')[-1])
            log_file = saga.filesystem.File("%s%s" % (sandbox_url, log), session=session)
            log_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            log_file.close()

    return ret
Example 17
def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None, 
        session=None, skip_existing=False):
    '''
    sid: session for which all profiles are fetched
    src: dir to look for client session profiles ($src/$sid/*.prof)
    tgt: dir to store the profile in
         - $tgt/$sid/*.prof,
         - $tgt/$sid/$pilot_id/*.prof)

    returns list of file names
    '''

    log = ru.get_logger('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError ('Please set RADICAL_PILOT_DBURL')

    if not src:
        src = os.getcwd()
            
    if not tgt:
        tgt = os.getcwd()
            
    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they don't get interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    # FIXME: should we record pwd or profile location in db session?  Or create
    #        a sandbox-like dir for storing profiles and logs?
    client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
    if not client_profiles:
        raise RuntimeError('no client profiles in %s/%s' % (src, sid))

    for client_profile in client_profiles:

        ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
        ret.append("%s" % ftgt.path)

        if skip_existing and os.path.isfile(ftgt.path) \
                and os.stat(ftgt.path).st_size > 0:

            logger.report.info("\t- %s\n" % client_profile.split('/')[-1])

        else:

            logger.report.info("\t+ %s\n" % client_profile.split('/')[-1])
            prof_file = saga.filesystem.File(client_profile, session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        pilot['uid'] = pilot['_id']

        log.debug("processing pilot '%s'", pilot['uid'])

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow use of a different access schema than was used for the run.
            # Useful if you ran from the headnode, but would like to retrieve
            # the profiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

          # print "Overriding remote sandbox: %s" % sandbox_url

        sandbox  = saga.filesystem.Directory (sandbox_url, session=session)

        # Try to fetch a tarball of profiles, so that we can get them all in one (SAGA) go!
        PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
        tarball_available = False
        try:
            if sandbox.is_file(PROFILES_TARBALL):
                log.warn("Profiles tarball exists")

                ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:

                    log.info("skip fetching of '%s/%s' to '%s'.", 
                             sandbox_url, PROFILES_TARBALL, tgt_url)
                    tarball_available = True
                else:

                    log.info("fetch '%s%s' to '%s'.", sandbox_url, 
                             PROFILES_TARBALL, tgt_url)

                    prof_file = saga.filesystem.File("%s%s" % (sandbox_url, PROFILES_TARBALL), session=session)
                    prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    prof_file.close()

                    tarball_available = True
            else:
                log.warn("profiles tarball doesnt exists!")

        except saga.DoesNotExist:
            log.exception("exception(TODO): profiles tarball does not exist!")

        try:
            os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            log.info("Extract tarball %s to '%s'.", ftgt.path, tgt_url.path)
            try:
                tarball = tarfile.open(ftgt.path, mode='r:gz')
                tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                profiles = glob.glob("%s/%s/*.prof" % (tgt_url.path, pilot['uid']))
                ret.extend(profiles)
            except Exception as e:
                log.warn('could not extract tarball %s [%s]', ftgt.path, e)
                print 'skip %s [%s]' % (ftgt.path, e)

            # If extract succeeded, no need to fetch individual profiles
            continue

        # If we don't have a tarball (for whatever reason), fetch individual profiles
        profiles = sandbox.list('*.prof')

        for prof in profiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                             and os.stat(ftgt.path).st_size > 0:

                logger.report.info("\t- %s\n" % str(prof).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(prof).split('/')[-1])
            prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof), session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    return ret
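
The matching call for fetch_profiles(); as the docstring notes, the client profiles are expected under $src/$sid/*.prof (all values here are placeholders):

import os

profiles = fetch_profiles('rp.session.example.0000',
                          src=os.getcwd(),
                          tgt='/tmp/profs',
                          skip_existing=True)

print '%d profiles fetched' % len(profiles)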
Example 18
#!/usr/bin/env python

import os
import radical.utils as ru

dburl = 'mongodb://144.76.72.175/am'
pwd = os.path.dirname(__file__)

if __name__ == '__main__':

    mongo, db, _, _, _ = ru.mongodb_connect(str(dburl))

    sid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
    coll = db[sid]
    json = ru.read_json('%s/01_publish_resources.json' % pwd)

    print 'create session %s' % sid

    for doc in json:
        if doc['type'] == 'session':
            doc['uid'] = sid
            doc['_id'] = sid
        coll.insert(doc)
        print 'insert %s %s' % (doc['type'], doc['uid'])
    print 'inserted session %s' % sid
def bootstrap_3():
    """
    This method continues where the bootstrapper left off, but will quickly pass
    control to the Agent class which will spawn the functional components.

    Most of bootstrap_3 applies only to agent_0; in particular, all mongodb
    interactions remain excluded for other sub-agent instances.

    The agent interprets a config file, which will specify in an agent_layout
    section:
      - what nodes should be used for sub-agent startup
      - what bridges should be started
      - what components should be started
      - what are the endpoints for bridges which are not started
    bootstrap_3 will create derived config files for all sub-agents.

    The agent master (agent_0) will collect information about the nodes required
    for all instances.  That is added to the config itself, for the benefit of
    the LRMS initialisation which is expected to block those nodes from the
    scheduler.
    """

    global lrms, agent, bridges

    # find out what agent instance name we have
    if len(sys.argv) != 2:
        raise RuntimeError("invalid number of parameters (%s)" % sys.argv)
    agent_name = sys.argv[1]

    # load the agent config, and overload the config dicts
    agent_cfg = "%s/%s.cfg" % (os.getcwd(), agent_name)
    print "startup agent %s : %s" % (agent_name, agent_cfg)

    cfg = ru.read_json_str(agent_cfg)
    cfg["agent_name"] = agent_name
    pilot_id = cfg["pilot_id"]

    # set up a logger and profiler
    prof = ru.Profiler("%s.bootstrap_3" % agent_name)
    prof.prof("sync ref", msg="agent start", uid=pilot_id)
    log = ru.get_logger("%s.bootstrap_3" % agent_name, "%s.bootstrap_3.log" % agent_name, "DEBUG")  # FIXME?
    log.info("start")
    prof.prof("sync ref", msg="agent start")

    try:
        import setproctitle as spt

        spt.setproctitle("radical.pilot %s" % agent_name)
    except Exception as e:
        log.debug("no setproctitle: %s", e)

    log.setLevel(cfg.get("debug", "INFO"))

    print "Agent config (%s):\n%s\n\n" % (agent_cfg, pprint.pformat(cfg))

    # quickly set up a mongodb handle so that we can report errors.
    # FIXME: signal handlers need mongo_p, but we won't have that until later
    if agent_name == "agent_0":

        # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold the
        # address of the tunnelized DB endpoint.
        # If it exists, we overrule the agent config with it.
        hostport = os.environ.get("RADICAL_PILOT_DB_HOSTPORT")
        if hostport:
            dburl = ru.Url(cfg["mongodb_url"])
            dburl.host, dburl.port = hostport.split(":")
            cfg["mongodb_url"] = str(dburl)

        _, mongo_db, _, _, _ = ru.mongodb_connect(cfg["mongodb_url"])
        mongo_p = mongo_db["%s.p" % cfg["session_id"]]

        if not mongo_p:
            raise RuntimeError("could not get a mongodb handle")

    # set up signal and exit handlers
    def exit_handler():
        global lrms, agent, bridges

        print "atexit"
        if lrms:
            lrms.stop()
            lrms = None
        if bridges:
            for b in bridges.values():
                b["handle"].stop()
            bridges = dict()
        if agent:
            agent.stop()
            agent = None
        sys.exit(1)

    def sigint_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGINT. EXITING (%s)" % frame)
        print "sigint"
        prof.prof("stop", msg="sigint_handler", uid=pilot_id)
        prof.close()
        sys.exit(2)

    def sigterm_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGTERM. EXITING (%s)" % frame)
        print "sigterm"
        prof.prof("stop", msg="sigterm_handler %s" % os.getpid(), uid=pilot_id)
        prof.close()
        sys.exit(3)

    def sigalarm_handler(signum, frame):
        if agent_name == "agent_0":
            pilot_FAILED(msg="Caught SIGALRM (Walltime limit?). EXITING (%s)" % frame)
        print "sigalrm"
        prof.prof("stop", msg="sigalarm_handler", uid=pilot_id)
        prof.close()
        sys.exit(4)

    import atexit

    atexit.register(exit_handler)
    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGALRM, sigalarm_handler)

    # if anything went wrong up to this point, we would have been unable to
    # report errors into mongodb.  From here on, any fatal error should result
    # in one of the above handlers or exit handlers being activated, thus
    # reporting the error dutifully.

    try:
        # ----------------------------------------------------------------------
        # des Pudels Kern (the heart of the matter): merge LRMS info into the
        # cfg and get the agent started

        if agent_name == "agent_0":

            # only the master agent creates the LRMS and the sub-agent config
            # files.  The LRMS will give us the set of agent_nodes to use for
            # sub-agent startup.  We add the remaining LRMS information to the
            # config, for the benefit of the scheduler.

            lrms = rp.agent.RM.create(name=cfg["lrms"], cfg=cfg, logger=log)
            cfg["lrms_info"] = lrms.lrms_info

            # the master agent is also the only one which starts bridges.  It
            # has to do so before creating the Agent Worker instance, as that
            # instance already uses the bridges.

            bridges = start_bridges(cfg, log)
            # FIXME: make sure all communication channels are in place.  This could
            # be replaced with a proper barrier, but not sure if that is worth it...
            time.sleep(1)

            # after we started the bridges, we add their input and output
            # addresses to the config, so that the communication channels can
            # connect to them.  At this point we also write configs for all
            # sub-agents this instance intends to spawn.
            #
            # FIXME: we should point the address to the node of the subagent
            #        which hosts the bridge, not the local IP.  Until this
            #        is fixed, bridges MUST run on agent_0 (which is what
            #        RM.hostip() below will point to).
            nodeip = rp.agent.RM.hostip(cfg.get("network_interface"), logger=log)
            write_sub_configs(cfg, bridges, nodeip, log)

            # Store some runtime information into the session
            mongo_p.update(
                {"_id": pilot_id},
                {"$set": {"lm_info": lrms.lm_info.get("version_info"), "lm_detail": lrms.lm_info.get("lm_detail")}},
            )

        # we now have correct bridge addresses added to the agent_0.cfg, and all
        # other agents will have picked that up from their config files -- we
        # can start the agent and all its components!
        agent = rp.worker.Agent(cfg)
        agent.start()

        log.debug("waiting for agent %s to join" % agent_name)
        agent.join()
        log.debug("agent %s joined" % agent_name)

        # ----------------------------------------------------------------------

    except SystemExit:
        log.exception("Exit running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "sys.exit"

    except Exception as e:
        log.exception("Error running agent: %s" % agent_name)
        if agent and not agent.final_cause:
            agent.final_cause = "error"

    finally:

        # in all cases, make sure we perform an orderly shutdown.  I hope python
        # does not mind doing all those things in a finally clause of
        # (essentially) main...
        if agent:
            agent.stop()
        log.debug("agent %s finalized" % agent_name)

        # agent.stop will not tear down bridges -- we do that here at last
        for name, b in bridges.items():
            try:
                log.info("closing bridge %s", b)
                b["handle"].stop()
            except Exception as e:
                log.exception("ignore failing bridge terminate (%s)", e)
        bridges = dict()

        # make sure the lrms release whatever it acquired
        if lrms:
            lrms.stop()
            lrms = None

        # agent_0 will also report final pilot state to the DB
        if agent_name == "agent_0":
            if agent and agent.final_cause == "timeout":
                pilot_DONE(mongo_p, pilot_id, log, "TIMEOUT received. Terminating.")
            elif agent and agent.final_cause == "cancel":
                pilot_CANCELED(mongo_p, pilot_id, log, "CANCEL received. Terminating.")
            elif agent and agent.final_cause == "sys.exit":
                pilot_CANCELED(mongo_p, pilot_id, log, "EXIT received. Terminating.")
            elif agent and agent.final_cause == "finalize":
                log.info("shutdown due to component finalization -- assuming error")
                pilot_FAILED(mongo_p, pilot_id, log, "FINALIZE received")
            elif agent:
                pilot_FAILED(mongo_p, pilot_id, log, "TERMINATE received")
            else:
                pilot_FAILED(mongo_p, pilot_id, log, "FAILED startup")

        log.info("stop")
        prof.prof("stop", msg="finally clause agent", uid=pilot_id)
        prof.close()
Example n. 20
import os
import glob
import tarfile

import radical.utils as ru
import radical.saga  as rs


def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
        session=None, skip_existing=False, fetch_client=False, log=None):
    '''
    sid: session for which all profiles are fetched
    src: dir to look for client session profiles ($src/$sid/*.prof)
    tgt: dir to store the profiles in:
         - $tgt/$sid/*.prof
         - $tgt/$sid/$pilot_id/*.prof

    returns a list of file names
    '''
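    # typical invocation -- an illustrative sketch, session id and target dir
    # are made up:
    #
    #   profs = fetch_profiles(sid='rp.session.host.user.017000.0000',
    #                          tgt='/tmp/profiles', fetch_client=True,
    #                          skip_existing=True)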

    # make sure 'log' and 'rep' are both defined, even if a logger was passed
    # in without a session
    if session:
        if not log:
            log = session._log
        rep = session._rep
    else:
        if not log:
            log = ru.Logger('radical.pilot.utils')
        rep = ru.Reporter('radical.pilot.utils')

    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise ValueError('RADICAL_PILOT_DBURL is not set')
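    # RADICAL_PILOT_DBURL is expected to look like the following illustrative
    # value:  'mongodb://user:password@db.host.org:27017/dbname'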

    if not src:
        src = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = rs.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they don't get interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'
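    # at this point a relative 'tgt' such as 'profiles' has become something
    # like 'file://localhost/home/user/profiles/<sid>/' (hypothetical path)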

    # first fetch session profile
    if fetch_client:
        client_profiles = glob.glob("%s/%s/*.prof" % (src, sid))
        if not client_profiles:
            raise RuntimeError('no client profiles in %s/%s' % (src, sid))

        for client_profile in client_profiles:

            ftgt = rs.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                    and os.stat(ftgt.path).st_size > 0:
                pass
            else:
                prof_file = rs.fs.File(client_profile, session=session)
                prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                prof_file.close()

            if not os.path.isfile(client_profile):
                raise RuntimeError('client profile %s does not exist' % client_profile)

    _, db, _, _, _ = ru.mongodb_connect (dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    log.debug("Session: %s", sid)
    log.debug("Number of pilots in session: %d", num_pilots)

    for pilot in pilots:

        try:
            log.debug("processing pilot '%s'", pilot['uid'])

            sandbox_url = rs.Url(pilot['pilot_sandbox'])

            if access:
                # Allow the use of a different access schema than used for the
                # run.  Useful if you ran from the headnode, but would like to
                # retrieve the profiles to your desktop (Hello Titan).
                access_url = rs.Url(access)
                sandbox_url.schema = access_url.schema
                sandbox_url.host   = access_url.host

              # print "Overriding remote sandbox: %s" % sandbox_url

            sandbox = rs.fs.Directory (sandbox_url, session=session)

            # Try to fetch a tarball of profiles, so that we can get them all in one (SAGA) go!
            PROFILES_TARBALL = '%s.prof.tgz' % pilot['uid']
            tarball_available = False
            try:
                if  sandbox.is_file(PROFILES_TARBALL) and \
                    sandbox.get_size(PROFILES_TARBALL):

                    log.info("profiles tarball exists")
                    ftgt = rs.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                    if skip_existing and os.path.isfile(ftgt.path) \
                            and os.stat(ftgt.path).st_size > 0:

                        log.info("skip fetching of '%s/%s' to '%s'.", 
                                 sandbox_url, PROFILES_TARBALL, tgt_url)
                        tarball_available = True
                    else:

                        log.info("fetch '%s%s' to '%s'.", sandbox_url, 
                                 PROFILES_TARBALL, tgt_url)

                        prof_file = rs.fs.File("%s%s" % (sandbox_url,
                                            PROFILES_TARBALL), session=session)
                        prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                        prof_file.close()

                        tarball_available = True
                else:
                    log.warn("profiles tarball doesnt exists!")

            except rs.DoesNotExist:
                log.exception("exception(TODO): profiles tarball doesnt exists!")

            try:
                os.mkdir("%s/%s" % (tgt_url.path, pilot['uid']))
            except OSError:
                pass

            # We now have a local tarball
            if tarball_available:
                log.info("Extract tarball %s to '%s'.", ftgt.path, tgt_url.path)
                try:
                    tarball = tarfile.open(ftgt.path, mode='r:gz')
                    tarball.extractall("%s/%s" % (tgt_url.path, pilot['uid']))

                    profiles = glob.glob("%s/%s/*.prof" % (tgt_url.path, pilot['uid']))
                    ret.extend(profiles)
                    os.unlink(ftgt.path)

                    # If extract succeeded, no need to fetch individual profiles
                    rep.ok("+ %s (profiles)\n" % pilot['uid'])
                    continue

                except Exception as e:
                    log.warn('could not extract tarball %s [%s]', ftgt.path, e)

            # If we don't have a tarball (for whatever reason), fetch individual profiles
            profiles = sandbox.list('*.prof')
            for prof in profiles:

                ftgt = rs.Url('%s/%s/%s' % (tgt_url, pilot['uid'], prof))
                ret.append("%s" % ftgt.path)

                if skip_existing and os.path.isfile(ftgt.path) \
                                 and os.stat(ftgt.path).st_size > 0:
                    pass
                else:
                    prof_file = rs.fs.File("%s%s" % (sandbox_url, prof), session=session)
                    prof_file.copy(ftgt, flags=rs.fs.CREATE_PARENTS)
                    prof_file.close()

            rep.ok("+ %s (profiles)\n" % pilot['uid'])

        except Exception as e:
            rep.error("- %s (profiles)\n" % pilot['uid'])
            log.exception('failed to fetch profiles for %s', pilot['uid'])

    return ret
Example n. 21
    def __init__(self, sid, dburl, cfg, log, connect=True):
        """
        Creates a new session

        A session is a MongoDB collection which contains documents of
        different types:

        session : document describing this rp.Session (singleton)
        pmgr    : document describing a rp.PilotManager
        pilots  : document describing a rp.Pilot
        umgr    : document describing a rp.UnitManager
        units   : document describing a rp.Unit
        """

        self._dburl = dburl
        self._log = log
        self._mongo = None
        self._db = None
        self._created = time.time()
        self._connected = None
        self._closed = None
        self._c = None
        self._can_remove = False

        if not connect:
            return

        # mongodb_connect wants a string at the moment
        self._mongo, self._db, _, _, _ = ru.mongodb_connect(str(dburl))

        if self._mongo is None or self._db is None:
            raise RuntimeError('Could not connect to database at %s' % dburl)

        self._connected = time.time()

        self._c = self._db[sid]  # creates collection (lazily)

        # If session exists, we assume this is a reconnect, otherwise we create
        # the session entry.
        # NOTE: hell will break loose if session IDs are not unique!
        if not self._c.count():

            # make 'uid', 'type' and 'state' indexes, as we frequently query
            # based on combinations of those.  Only 'uid' is unique
            pma = pymongo.ASCENDING
            self._c.create_index([('uid', pma)], unique=True, sparse=False)
            self._c.create_index([('type', pma)], unique=False, sparse=False)
            self._c.create_index([('state', pma)], unique=False, sparse=False)
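            # illustrative queries which benefit from these indexes (values
            # made up):  self._c.find({'type': 'unit', 'state': 'DONE'})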

            # insert the session doc
            self._c.insert({
                'type': 'session',
                '_id': sid,
                'uid': sid,
                'cfg': copy.deepcopy(cfg),
                'created': self._created,
                'connected': self._connected
            })
            self._can_remove = True
        else:
            docs = self._c.find({'type': 'session', 'uid': sid})
            if not docs.count():
                raise ValueError('cannot reconnect to session %s' % sid)

            doc = docs[0]
            self._can_remove = False
            self._created = doc['created']
            self._connected = time.time()
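
    # illustrative construction of such a session -- sid, dburl and cfg are
    # made-up values:
    #
    #   s = Session(sid   = 'rp.session.0000',
    #               dburl = 'mongodb://localhost:27017/rp',
    #               cfg   = {},
    #               log   = ru.Logger('radical.pilot'))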
Example n. 22
    def __init__(self, sid, dburl, cfg, logger, connect=True):
        """ 
        Creates a new session

        A session is a MongoDB collection which contains documents of
        different types:

        session : document describing this rp.Session (singleton)
        pmgr    : document describing a rp.PilotManager 
        pilots  : document describing a rp.Pilot
        umgr    : document describing a rp.UnitManager
        units   : document describing a rp.Unit
        """

        self._dburl      = dburl
        self._log        = logger
        self._mongo      = None
        self._db         = None
        self._created    = time.time()
        self._connected  = None
        self._closed     = None
        self._c          = None
        self._can_remove = False

        if not connect:
            return

        # mongodb_connect wants a string at the moment
        self._mongo, self._db, _, _, _ = ru.mongodb_connect(str(dburl))

        if self._mongo is None or self._db is None:
            raise RuntimeError('Could not connect to database at %s' % dburl)

        self._connected = time.time()

        self._c = self._db[sid] # creates collection (lazily)

        # If session exists, we assume this is a reconnect, otherwise we create
        # the session entry.
        # NOTE: hell will break loose if session IDs are not unique!
        if not self._c.count():

            # make 'uid', 'type' and 'state' indexes, as we frequently query
            # based on combinations of those.  Only 'uid' is unique
            self._c.create_index([('uid',   pymongo.ASCENDING)], unique=True,  sparse=False)
            self._c.create_index([('type',  pymongo.ASCENDING)], unique=False, sparse=False)
            self._c.create_index([('state', pymongo.ASCENDING)], unique=False, sparse=False)

            # insert the session doc
            self._c.insert({'type'      : 'session',
                            '_id'       : sid,
                            'uid'       : sid,
                            'cfg'       : copy.deepcopy(cfg),
                            'created'   : self._created,
                            'connected' : self._connected})
            self._can_remove = True
        else:
            docs = self._c.find({'type' : 'session', 
                                 'uid'  : sid})
            if not docs.count():
                raise ValueError('cannot reconnect to session %s' % sid)

            doc = docs[0]
            self._can_remove = False
            self._created    = doc['created']
            self._connected  = time.time()
Example n. 23
    plt.savefig(ofile)


# -----------------------------------------------------------------------------
#
if __name__ == '__main__':

    session = None
    q_pilots = None
    timing = None
    timings = {}
    #pdir = os.environ['PLOT_DIR']

    cachedir = os.getcwd()
    dburl = 'mongodb://*****:*****@ds053838.mongolab.com:53838/hicomb'
    mongo, db, dbname, cname, pname = ru.mongodb_connect(str(dburl))
    
    if len(sys.argv) <= 1:
        usage("insufficient arguments -- need session ID")

    if len(sys.argv) > 4:
        usage("too many arguments -- no more than 3")

    if len(sys.argv[1]) < 20:
        usage("illegal session token -- valid e.g. 54b1c5d523769c2f1b55dffd")
    else:
        session = sys.argv[1]

    if len(sys.argv) > 2:
        timing = sys.argv[2]
Example n. 24
import os
import glob
import tarfile

import saga
import radical.utils as ru


def fetch_profiles(sid,
                   dburl=None,
                   client=None,
                   tgt=None,
                   access=None,
                   session=None,
                   skip_existing=False):
    '''
    sid: session for which all profiles are fetched
    client: dir to look for client session profiles
    tgt: dir to store the profile in

    returns list of file names
    '''

    ret = list()

    if not dburl:
        dburl = os.environ.get('RADICAL_PILOT_DBURL')

    if not dburl:
        raise RuntimeError('Please set RADICAL_PILOT_DBURL')

    if not client:
        client = os.getcwd()

    if not tgt:
        tgt = os.getcwd()

    if not tgt.startswith('/') and '://' not in tgt:
        tgt = "%s/%s" % (os.getcwd(), tgt)

    # we always create a session dir as real target
    tgt_url = saga.Url("%s/%s/" % (tgt, sid))

    # Turn URLs without schema://host into file://localhost,
    # so that they don't get interpreted as relative.
    if not tgt_url.schema:
        tgt_url.schema = 'file'
    if not tgt_url.host:
        tgt_url.host = 'localhost'

    # first fetch session profile
    # FIXME: should we record pwd or profile location in db session?  Or create
    #        a sandbox like dir for storing profiles and logs?
    client_profile = "%s/%s.prof" % (client, sid)

    ftgt = saga.Url('%s/%s' % (tgt_url, os.path.basename(client_profile)))
    ret.append("%s" % ftgt.path)

    if skip_existing and os.path.isfile(ftgt.path) \
            and os.stat(ftgt.path).st_size > 0:

        logger.report.info("\t- %s\n" % client_profile.split('/')[-1])

    else:

        logger.report.info("\t+ %s\n" % client_profile.split('/')[-1])
        prof_file = saga.filesystem.File(client_profile, session=session)
        prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
        prof_file.close()

    _, db, _, _, _ = ru.mongodb_connect(dburl)

    json_docs = get_session_docs(db, sid)

    pilots = json_docs['pilot']
    num_pilots = len(pilots)
    #  print "Session: %s" % sid
    #  print "Number of pilots in session: %d" % num_pilots

    for pilot in pilots:

        # print "Processing pilot '%s'" % pilot['_id']

        sandbox_url = saga.Url(pilot['sandbox'])

        if access:
            # Allow the use of a different access scheme than used for the
            # run.  Useful if you ran from the headnode, but would like to
            # retrieve the profiles to your desktop (Hello Titan).
            access_url = saga.Url(access)
            sandbox_url.schema = access_url.schema
            sandbox_url.host = access_url.host

        # print "Overriding remote sandbox: %s" % sandbox_url

        sandbox = saga.filesystem.Directory(sandbox_url, session=session)

        # Try to fetch a tarball of profiles, so that we can get them all in one (SAGA) go!
        PROFILES_TARBALL = '%s.prof.tgz' % pilot['_id']
        tarball_available = False
        try:
            if sandbox.is_file(PROFILES_TARBALL):
                print "Profiles tarball exists!"

                ftgt = saga.Url('%s/%s' % (tgt_url, PROFILES_TARBALL))

                if skip_existing and os.path.isfile(ftgt.path) \
                        and os.stat(ftgt.path).st_size > 0:

                    print "Skipping fetching of '%s/%s' to '%s'." % (
                        sandbox_url, PROFILES_TARBALL, tgt_url)
                    tarball_available = True
                else:

                    print "Fetching '%s%s' to '%s'." % (
                        sandbox_url, PROFILES_TARBALL, tgt_url)
                    prof_file = saga.filesystem.File(
                        "%s%s" % (sandbox_url, PROFILES_TARBALL),
                        session=session)
                    prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
                    prof_file.close()

                    tarball_available = True
            else:
                print "Profiles tarball doesnt exists!"

        except saga.DoesNotExist:
            print "exception(TODO): profiles tarball doesnt exists!"

        try:
            os.mkdir("%s/%s" % (tgt_url.path, pilot['_id']))
        except OSError:
            pass

        # We now have a local tarball
        if tarball_available:
            print "Extracting tarball %s into '%s'." % (ftgt.path,
                                                        tgt_url.path)
            tarball = tarfile.open(ftgt.path)
            tarball.extractall("%s/%s" % (tgt_url.path, pilot['_id']))

            profiles = glob.glob("%s/%s/*.prof" % (tgt_url.path, pilot['_id']))
            print "Tarball %s extracted to '%s/%s/'." % (
                ftgt.path, tgt_url.path, pilot['_id'])
            ret.extend(profiles)

            # If extract succeeded, no need to fetch individual profiles
            continue

        # If we don't have a tarball (for whatever reason), fetch individual profiles
        profiles = sandbox.list('*.prof')

        for prof in profiles:

            ftgt = saga.Url('%s/%s/%s' % (tgt_url, pilot['_id'], prof))
            ret.append("%s" % ftgt.path)

            if skip_existing and os.path.isfile(ftgt.path) \
                             and os.stat(ftgt.path).st_size > 0:

                logger.report.info("\t- %s\n" % str(prof).split('/')[-1])
                continue

            logger.report.info("\t+ %s\n" % str(prof).split('/')[-1])
            prof_file = saga.filesystem.File("%s%s" % (sandbox_url, prof),
                                             session=session)
            prof_file.copy(ftgt, flags=saga.filesystem.CREATE_PARENTS)
            prof_file.close()

    return ret