Beispiel #1
0
class Session(rs.Session):
    """
    A Session encapsulates a RADICAL-Pilot instance and is the *root* object
    of the RADICAL-Pilot object hierarchy.

    A Session holds :class:`radical.pilot.PilotManager` and
    :class:`radical.pilot.UnitManager` instances which in turn hold
    :class:`radical.pilot.ComputePilot` and :class:`radical.pilot.ComputeUnit`
    instances.
    """

    # the reporter is an application-level singleton
    _reporter = None

    # We keep a static typemap for component startup. If we ever want to
    # become really fancy, we can derive that typemap from rp module
    # inspection.
    #
    # --------------------------------------------------------------------------
    #
    def __init__(self, dburl=None, uid=None, cfg=None, _connect=True):
        """
        Creates a new session.  A new Session instance is created and
        stored in the database.

        **Arguments:**
            * **dburl** (`string`): The MongoDB URL.  If none is given, the
              URL is looked up (in this order) in the environment variable
              RADICAL_PILOT_DBURL, in the config entry `default_dburl` and
              in the config entry `dburl`.  If none of those is set and
              `_connect` is true, an error will be raised.

            * **uid** (`string`): Create a session with this UID.  If no uid
              is given, the config entry `session_id` is used.  A session
              which obtains its uid in either way is treated as
              *reconnected*; otherwise a new uid is generated.
              *Only use this when you know what you are doing!*

            * **cfg** (`dict`): The session configuration.  A deep copy is
              stored, the passed object is not changed.  If no config is
              given, `configs/session_<name>.json` is read from the
              directory of this module, where `<name>` is the value of the
              environment variable RADICAL_PILOT_SESSION_CFG (`default` if
              unset).

            * **_connect** (`bool`): If true, a database URL is required and
              checked.  The flag is also passed on to the database session.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`RuntimeError` if `_connect` is set but no database
              URL can be found, or if the database session cannot be
              created.
            * :class:`ValueError` if `_connect` is set, the database URL
              contains no database name, and no uid is known.

        """

        if os.uname()[0] == 'Darwin':
            # on MacOS, we are running out of file descriptors soon.  The code
            # below attempts to increase the limit of open files - but any error
            # is silently ignored, so this is a best-effort, no guarantee.  We
            # leave responsibility for system limits with the user.
            try:
                import resource
                soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

                # never ask for more than the hard limit allows
                target = 512
                if hard != resource.RLIM_INFINITY:
                    target = min(target, hard)

                # only ever *raise* the soft limit: a user may already have
                # configured a larger (or unlimited) value
                if soft != resource.RLIM_INFINITY and soft < target:
                    resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))

            except Exception:
                # deliberate best-effort, see above
                pass

        self._dh = ru.DebugHelper()
        self._valid = True
        self._closed = False
        self._valid_iter = 0  # detect recursive calls of `is_valid()`

        # class state
        self._dbs = None
        self._uid = None
        self._dburl = None
        self._reconnected = False

        self._cache = dict()  # cache sandboxes etc.
        self._cache_lock = threading.RLock()

        self._cache['resource_sandbox'] = dict()
        self._cache['session_sandbox'] = dict()
        self._cache['pilot_sandbox'] = dict()

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        # NOTE(review): this is a second helper instance next to `self._dh`
        #               above - one of them is likely redundant; confirm
        #               which attribute is used elsewhere before removing.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        # NOTE: should this also include agents?
        self._pmgrs = dict()
        self._umgrs = dict()
        self._bridges = list()
        self._components = list()

        # FIXME: we work around some garbage collection issues we don't yet
        #        understand: instead of relying on the GC to eventually collect
        #        some stuff, we actively free those on `session.close()`, at
        #        least for the current process.  Usually, all resources get
        #        nicely collected on process termination - but not when we
        #        create many sessions (one after the other) in the same
        #        application instance (ie. the same process).  This workaround
        #        takes care of that use case.
        #        The clean solution would be to ensure clean termination
        #        sequence, something which I seem to be unable to implement...
        #        :/
        self._to_close = list()
        self._to_stop = list()
        self._to_destroy = list()

        # cache the client sandbox
        # FIXME: this needs to be overwritten if configured differently in the
        #        session config, as should be the case for any agent side
        #        session instance.
        self._client_sandbox = os.getcwd()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        # if a config is given, use its values:
        if cfg:
            self._cfg = copy.deepcopy(cfg)
        else:
            # otherwise we need a config
            cfg_name = os.environ.get('RADICAL_PILOT_SESSION_CFG', 'default')
            self._cfg = ru.read_json("%s/configs/session_%s.json"
                                     % (os.path.dirname(__file__), cfg_name))

        # fall back to config data where possible
        # sanity check on parameters
        if not uid:
            uid = self._cfg.get('session_id')

        if uid:
            # a known uid (argument or config) marks a reconnected session
            self._uid = uid
            self._reconnected = True
        else:
            # generate new uid, reset all other ID counters
            # FIXME: this will screw up counters for *concurrent* sessions,
            #        as the ID generation is managed in a process singleton.
            self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
            ru.reset_id_counters(prefix='rp.session', reset_all_others=True)

        # fill config entries which are derived from the session uid
        if not self._cfg.get('session_id'):
            self._cfg['session_id'] = self._uid

        if not self._cfg.get('owner'):
            self._cfg['owner'] = self._uid

        if not self._cfg.get('logdir'):
            self._cfg['logdir'] = '%s/%s' % (os.getcwd(), self._uid)

        self._logdir = self._cfg['logdir']
        self._prof = self._get_profiler(name=self._cfg['owner'])
        self._rep = self._get_reporter(name=self._cfg['owner'])
        self._log = self._get_logger(name=self._cfg['owner'],
                                     level=self._cfg.get('debug'))

        if _connect:
            # we need a dburl to connect to.

            if not dburl:
                dburl = os.environ.get("RADICAL_PILOT_DBURL")

            if not dburl:
                dburl = self._cfg.get('default_dburl')

            if not dburl:
                dburl = self._cfg.get('dburl')

            if not dburl:
                # NOTE(review): an older comment here claimed a missing dburl
                #               is forgiven on reconnect - the code raises
                #               unconditionally.
                raise RuntimeError("no database URL (set RADICAL_PILOT_DBURL)")

        self._dburl = ru.Url(dburl)
        self._cfg['dburl'] = str(self._dburl)

        # now we have config and uid - initialize base class (saga session)
        rs.Session.__init__(self, uid=self._uid)

        # ----------------------------------------------------------------------
        # create new session
        if _connect:
            self._log.info("using database %s" % self._dburl)

            # if the database url contains a path element, we interpret that as
            # database name (without the leading slash)
            db_path = self._dburl.path
            if not db_path or db_path[0] != '/' or len(db_path) <= 1:
                if not uid:
                    # we fake reconnect if no DB is available -- but otherwise
                    # we really really need a db connection...
                    raise ValueError("incomplete DBURL '%s' no db name!" %
                                     self._dburl)

        if not self._reconnected:
            self._prof.prof('session_start', uid=self._uid)
            self._rep.info('<<new session: ')
            self._rep.plain('[%s]' % self._uid)
            self._rep.info('<<database   : ')
            self._rep.plain('[%s]' % self._dburl)

        self._load_resource_configs()

        self._rec = os.environ.get('RADICAL_PILOT_RECORD_SESSION')
        if self._rec:
            # NOTE: Session recording cannot handle reconnected sessions, yet.
            #       We thus turn it off here with a warning
            if self._reconnected:
                self._log.warn("no session recording on reconnected session")

            else:
                # append session ID to recording path
                self._rec = "%s/%s" % (self._rec, self._uid)

                # create recording path and record session.  We do not go
                # through a shell here: the path stems from the environment
                # and must not be subject to shell interpretation.
                try:
                    os.makedirs(self._rec)
                except OSError:
                    # like `mkdir -p`: an already existing dir is fine
                    if not os.path.isdir(self._rec):
                        raise

                ru.write_json({'dburl': str(self.dburl)},
                              "%s/session.json" % self._rec)
                self._log.info("recording session in %s" % self._rec)

        # create/connect database handle
        try:
            self._dbs = DBSession(sid=self.uid,
                                  dburl=str(self._dburl),
                                  cfg=self._cfg,
                                  logger=self._log,
                                  connect=_connect)

            # from here on we should be able to close the session again
            self._log.info("New Session created: %s." % self.uid)

        except Exception as ex:
            self._rep.error(">>err\n")
            self._log.exception('session create failed')
            raise RuntimeError("Couldn't create new session (database URL '%s' incorrect?): %s" \
                            % (dburl, ex))

        # the session must not carry bridge and component handles across forks
        ru.atfork(self._atfork_prepare, self._atfork_parent,
                  self._atfork_child)

        # if bridges and components are specified in the config, start them
        ruc = rpu.Component
        self._bridges = ruc.start_bridges(self._cfg, self, self._log)
        self._components = ruc.start_components(self._cfg, self, self._log)
        self.is_valid()

        # FIXME: make sure the above code results in a usable session on
        #        reconnect
        self._rep.ok('>>ok\n')

    def initialize_common(self):
        """
        Register the `_atfork_prepare`, `_atfork_parent` and `_atfork_child`
        methods of this instance (in that order) via `ru.atfork`.
        """

        # the manager must not carry bridge and component handles across forks
        ru.atfork(self._atfork_prepare, self._atfork_parent, self._atfork_child)
Beispiel #3
0
    def initialize_common(self):
        """
        Register the `_atfork_prepare`, `_atfork_parent` and `_atfork_child`
        methods of this instance (in that order) via `ru.atfork`.
        """

        # the manager must not carry bridge and component handles across forks
        fork_hooks = (self._atfork_prepare,
                      self._atfork_parent,
                      self._atfork_child)
        ru.atfork(*fork_hooks)