Example no. 1
    def call_unit_state_callbacks(self, unit_id, new_state):
        """Wrapper function to call all all relevant callbacks, on unit-level
        as well as manager-level.
        """

        # this is the point where, at the earliest, the application could have
        # been notified about unit state changes.  So we record that event.
        if unit_id not in self._callback_histories:
            self._callback_histories[unit_id] = list()
        self._callback_histories[unit_id].append({
            'timestamp': datetime.datetime.utcnow(),
            'state':     new_state
        })

        for [cb, cb_data] in self._shared_data[unit_id]['callbacks']:
            try:

                if self._shared_data[unit_id]['facade_object']:
                    if cb_data:
                        cb(self._shared_data[unit_id]['facade_object'],
                           new_state, cb_data)
                    else:
                        cb(self._shared_data[unit_id]['facade_object'],
                           new_state)
                else:
                    logger.error("Couldn't call callback (no pilot instance)")
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # If we have any manager-level callbacks registered, we
        # call those as well!
        if UNIT_STATE not in self._manager_callbacks:
            self._manager_callbacks[UNIT_STATE] = list()

        for [cb, cb_data] in self._manager_callbacks[UNIT_STATE]:
            if not self._shared_data[unit_id]['facade_object']:
                logger.warning('skipping callback for incomplete unit (%s: %s)' %
                               (unit_id, new_state))
                break

            try:
                if cb_data:
                    cb(self._shared_data[unit_id]['facade_object'], new_state,
                       cb_data)
                else:
                    cb(self._shared_data[unit_id]['facade_object'], new_state)
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # If we meet a final state, we record the object's callback history for
        # later evaluation.
        if new_state in (DONE, FAILED, CANCELED):
            self._db.publish_compute_unit_callback_history(
                unit_id, self._callback_histories[unit_id])
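
A minimal sketch of a unit-level callback compatible with the invocation pattern above: the callback receives the unit facade object and the new state, plus optional callback data. The registration call is commented out because the exact registration API is an assumption here, not shown in this example.

# Illustrative only: a unit-level state callback matching the signatures used
# by call_unit_state_callbacks() -- cb(unit, state) or cb(unit, state, cb_data).
def unit_state_cb(unit, state, cb_data=None):
    # 'unit' is the facade object stored under 'facade_object' above
    print("unit %s reached state %s (cb_data: %s)" % (unit.uid, state, cb_data))

# Hypothetical registration; the exact registration API is assumed:
# unit.register_callback(unit_state_cb)
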
Example no. 2
    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.

        All subsequent attempts to access objects attached to the session will
        result in an error. If `cleanup` is set to True (the default), the
        session data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove the session from MongoDB (implies `terminate`).
            * **terminate** (`bool`): Shut down all pilots associated with the session. 

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        logger.debug("session %s closing" % (str(self._uid)))

        uid = self._uid

        if not self._uid:
            logger.error("Session object already closed.")
            return

        # we keep 'delete' for backward compatibility.  If it is set, and the
        # other flags (cleanup, terminate) are left at their defaults (True),
        # then delete will supersede them.  'delete' is deprecated though, so
        # we issue a warning.
        if delete is not None:

            if cleanup and terminate:
                cleanup = delete
                terminate = delete
                logger.warning("'delete' flag on session is deprecated. " \
                               "Please use 'cleanup' and 'terminate' instead!")

        if cleanup:
            # cleanup implies terminate
            terminate = True

        for pmgr in self._pilot_manager_objects:
            logger.debug("session %s closes   pmgr   %s" %
                         (str(self._uid), pmgr._uid))
            pmgr.close(terminate=terminate)
            logger.debug("session %s closed   pmgr   %s" %
                         (str(self._uid), pmgr._uid))

        for umgr in self._unit_manager_objects:
            logger.debug("session %s closes   umgr   %s" %
                         (str(self._uid), umgr._uid))
            umgr.close()
            logger.debug("session %s closed   umgr   %s" %
                         (str(self._uid), umgr._uid))

        if cleanup:
            self._destroy_db_entry()

        logger.debug("session %s closed" % (str(self._uid)))
Example no. 3
    def close(self, cleanup=True, terminate=True, delete=None):
        """Closes the session.

        All subsequent attempts to access objects attached to the session will
        result in an error. If `cleanup` is set to True (the default), the
        session data is removed from the database.

        **Arguments:**
            * **cleanup** (`bool`): Remove the session from MongoDB (implies `terminate`).
            * **terminate** (`bool`): Shut down all pilots associated with the session. 

        **Raises:**
            * :class:`radical.pilot.IncorrectState` if the session is closed
              or doesn't exist. 
        """

        logger.debug("session %s closing" % (str(self._uid)))

        uid = self._uid

        if not self._uid:
            logger.error("Session object already closed.")
            return

        # we keep 'delete' for backward compatibility.  If it was set, and the
        # other flags (cleanup, terminate) are as defaulted (True), then delete
        # will supersede them.  Delete is considered deprecated though, and
        # we'll thus issue a warning.
        if  delete != None:

            if  cleanup == True and terminate == True :
                cleanup   = delete
                terminate = delete
                logger.warning("'delete' flag on session is deprecated. " \
                               "Please use 'cleanup' and 'terminate' instead!")

        if  cleanup :
            # cleanup implies terminate
            terminate = True

        for pmgr in self._pilot_manager_objects:
            logger.debug("session %s closes   pmgr   %s" % (str(self._uid), pmgr._uid))
            pmgr.close (terminate=terminate)
            logger.debug("session %s closed   pmgr   %s" % (str(self._uid), pmgr._uid))

        for umgr in self._unit_manager_objects:
            logger.debug("session %s closes   umgr   %s" % (str(self._uid), umgr._uid))
            umgr.close()
            logger.debug("session %s closed   umgr   %s" % (str(self._uid), umgr._uid))

        if  cleanup :
            self._destroy_db_entry()

        logger.debug("session %s closed" % (str(self._uid)))
    def call_unit_state_callbacks(self, unit_id, new_state):
        """Wrapper function to call all all relevant callbacks, on unit-level
        as well as manager-level.
        """

        # this is the point where, at the earliest, the application could have
        # been notified about unit state changes.  So we record that event.
        if  not unit_id in self._callback_histories :
            self._callback_histories[unit_id] = list()
        self._callback_histories[unit_id].append (
                {'timestamp' : datetime.datetime.utcnow(), 
                 'state'     : new_state})

        for [cb, cb_data] in self._shared_data[unit_id]['callbacks']:
            try:

                if self._shared_data[unit_id]['facade_object'] :
                    if  cb_data :
                        cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
                    else :
                        cb(self._shared_data[unit_id]['facade_object'], new_state)
                else :
                    logger.error("Couldn't call callback (no pilot instance)")
            except Exception as e:
                logger.exception(
                    "Couldn't call callback function %s" % e)
                raise

        # If we have any manager-level callbacks registered, we
        # call those as well!
        if  not UNIT_STATE in self._manager_callbacks :
            self._manager_callbacks[UNIT_STATE] = list()

        for [cb, cb_data] in self._manager_callbacks[UNIT_STATE]:
            if not self._shared_data[unit_id]['facade_object'] :
                logger.warning ('skipping callback for incomplete unit (%s: %s)' % (unit_id, new_state))
                break

            try:
                if  cb_data :
                    cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
                else :
                    cb(self._shared_data[unit_id]['facade_object'], new_state)
            except Exception as e:
                logger.exception(
                    "Couldn't call callback function %s" % e)
                raise

        # If we meet a final state, we record the object's callback history for
        # later evaluation.
        if  new_state in (DONE, FAILED, CANCELED) :
            self._db.publish_compute_unit_callback_history (unit_id, self._callback_histories[unit_id])
Example no. 5
    def close(self, terminate=True):
        """Shuts down the PilotManager and its background workers in a 
        coordinated fashion.

        **Arguments:**

            * **terminate** (`bool`): If set to True, all active pilots will
              be canceled (default: True).

        """

        logger.debug("pmgr    %s closing" % (str(self._uid)))

        # Spit out a warning in case the object was already closed.
        if not self._uid:
            logger.error("PilotManager object already closed.")
            return

        # before we terminate pilots, we have to kill the pilot launcher threads
        # -- otherwise we'll run into continuous race conditions due to the
        # ongoing state checks...
        if self._worker is not None:
            # Stop the worker process
            logger.debug("pmgr    %s cancel   worker %s" %
                         (str(self._uid), self._worker.name))
            self._worker.cancel_launcher()
            logger.debug("pmgr    %s canceled worker %s" %
                         (str(self._uid), self._worker.name))

        # If terminate is set, we cancel all pilots.
        if terminate:
            # cancel all pilots, make sure they are gone, and close the pilot
            # managers.
            for pilot in self.get_pilots():
                logger.debug("pmgr    %s cancels  pilot  %s" %
                             (str(self._uid), pilot._uid))
            self.cancel_pilots()

            # FIXME:
            #
            # wait_pilots() will wait until all pilots picked up the sent cancel
            # signal and died.  However, that can take a long time.  For
            # example, if a pilot is in 'PENDING_ACTIVE' state, this will have to
            # wait until the pilot is bootstrapped, started, connected to the DB,
            # and shut down again.  Or, for a pilot which just received a large
            # batch of units, it will have to wait until the pilot started all those units
            # and then checks its command queue again.  Or, if the pilot job
            # already died, wait will block until the state checker kicks in and
            # declares the pilot as dead, which takes a couple of minutes.
            #
            # A solution would be to add a CANCELING state and to wait for that one,
            # too, which basically means to wait until the cancel signal has been
            # sent.  There is not much more to do at this point anyway.  This is at
            # the moment faked in the manager controller, which sets that state
            # after sending the cancel command.  This should be converted into
            # a proper state -- that would, btw, remove the need for a cancel
            # command in the first place, as the pilot can just pull its own state
            # instead, and cancel on CANCELING...
            #
            # self.wait_pilots ()
            wait_for_cancel = True
            all_pilots = self.get_pilots()
            while wait_for_cancel:
                wait_for_cancel = False
                for pilot in all_pilots:
                    logger.debug("pmgr    %s wait for pilot  %s (%s)" %
                                 (str(self._uid), pilot._uid, pilot.state))
                    if pilot.state not in [DONE, FAILED, CANCELED, CANCELING]:
                        time.sleep(1)
                        wait_for_cancel = True
                        break
            for pilot in self.get_pilots():
                logger.debug("pmgr    %s canceled pilot  %s" %
                             (str(self._uid), pilot._uid))

        logger.debug("pmgr    %s stops    worker %s" %
                     (str(self._uid), self._worker.name))
        self._worker.stop()
        self._worker.join()
        logger.debug("pmgr    %s stopped  worker %s" %
                     (str(self._uid), self._worker.name))

        # Remove worker from registry
        self._session._process_registry.remove(self._uid)

        logger.debug("pmgr    %s closed" % (str(self._uid)))
        self._uid = None
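
For reference, a couple of illustrative call patterns; the `pmgr` variable is assumed to be a live PilotManager.

# Illustrative call patterns for PilotManager.close(); 'pmgr' is assumed to be live.
#
#   pmgr.close()                 # cancel the launcher, cancel all pilots, wait for them
#                                # to reach a final (or CANCELING) state, stop the worker
#   pmgr.close(terminate=False)  # leave pilots running; only stop the launcher and worker
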
Example no. 6
    def handle_schedule (self, schedule) :

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.
        
        if  not schedule :
            logger.debug ('skipping empty unit schedule')
            return

      # print 'handle schedule:'
      # import pprint
      # pprint.pprint (schedule)
      #
        pilot_cu_map = dict()
        unscheduled  = list()

        pilot_ids = self.list_pilots ()

        for unit in schedule['units'].keys() :

            pid = schedule['units'][unit]

            if  None == pid :
                unscheduled.append (unit)
                continue

            else :

                if  pid not in pilot_ids :
                    raise RuntimeError ("schedule points to unknown pilot %s" % pid)

                if  pid not in pilot_cu_map :
                    pilot_cu_map[pid] = list()

                pilot_cu_map[pid].append (unit)


        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in pilot_cu_map[pid] :

                if  not pid in schedule['pilots'] :
                    # lost pilot, do not schedule unit
                    logger.warn ("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                    continue

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

                ud = unit.description

                if  'kernel' in ud and ud['kernel'] :

                    try :
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex :
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                              "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state (unit._uid, FAILED, 
                                ["kernel expansion failed"])
                        continue

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd           = MDTaskDescription ()
                    mdtd.kernel    = ud.kernel
                    mdtd_bound     = mdtd.bind (resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec    = mdtd_bound.pre_exec
                    ud.executable  = mdtd_bound.executable
                    ud.mpi         = mdtd_bound.mpi


                units_to_schedule.append (unit)

            if  len(units_to_schedule) :
                self._worker.schedule_compute_units (pilot_uid=pid,
                                                     units=units_to_schedule)


        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if  old_wait_queue_size != self.wait_queue_size :
            self._worker.fire_manager_callback (WAIT_QUEUE_SIZE, self,
                                                self.wait_queue_size)

        if  len(unscheduled) :
            self._worker.unschedule_compute_units (units=unscheduled)

        logger.info ('%s units remain unscheduled' % len(unscheduled))
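
For orientation, this is the shape of the `schedule` structure that handle_schedule() reads, reconstructed from the lookups above; the concrete pilot id, sandbox path, and resource name are illustrative placeholders.

# Reconstructed from the accesses in handle_schedule(); all values are placeholders.
schedule = {
    'units': {
        # maps each unit object to the id of the pilot it was scheduled on,
        # or to None if the unit remains unscheduled (goes to the wait queue)
        # unit_a: 'pilot.0000',
        # unit_b: None,
    },
    'pilots': {
        'pilot.0000': {
            'sandbox':  '/scratch/user/radical.pilot.sandbox/pilot.0000',
            'resource': 'local.localhost',
        },
    },
}
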
Example no. 7
    def register_start_pilot_request(self, pilot, resource_config):
        """Register a new pilot start request with the worker.
        """

        # create a new UID for the pilot
        pilot_uid = ru.generate_id ('pilot')

        # get the filesystem endpoint from the resource config
        filesystem_endpoint = resource_config['filesystem_endpoint']

        fs = saga.Url(filesystem_endpoint)

        # get the home directory on the remote machine.
        # Note that this will only work for (gsi)ssh or shell based access
        # mechanisms (FIXME)

        import saga.utils.pty_shell as sup

        if fs.port is not None:
            url = "%s://%s:%d/" % (fs.schema, fs.host, fs.port)
        else:
            url = "%s://%s/" % (fs.schema, fs.host)

        logger.debug ("saga.utils.PTYShell ('%s')" % url)
        shell = sup.PTYShell(url, self._session, logger)

        if pilot.description.sandbox :
            workdir_raw = pilot.description.sandbox
        else :
            workdir_raw = resource_config.get ('default_remote_workdir', "$PWD")

        if '$' in workdir_raw or '`' in workdir_raw :
            ret, out, err = shell.run_sync (' echo "WORKDIR: %s"' % workdir_raw)
            if  ret == 0 and 'WORKDIR:' in out :
                workdir_expanded = out.split(":")[1].strip()
                logger.debug("Determined remote working directory for %s: '%s'" % (url, workdir_expanded))
            else :
                error_msg = "Couldn't determine remote working directory."
                logger.error(error_msg)
                raise Exception(error_msg)
        else :
            workdir_expanded = workdir_raw

        # At this point we have determined 'pwd'
        fs.path = "%s/radical.pilot.sandbox" % workdir_expanded

        # This is the base URL / 'sandbox' for the pilot!
        agent_dir_url = saga.Url("%s/%s-%s/" % (str(fs), self._session.uid, pilot_uid))

        # Create a database entry for the new pilot.
        pilot_uid, pilot_json = self._db.insert_pilot(
            pilot_uid=pilot_uid,
            pilot_manager_uid=self._pm_id,
            pilot_description=pilot.description,
            pilot_sandbox=str(agent_dir_url), 
            global_sandbox=str(fs.path)
            )

        # Create a shared data store entry
        self._shared_data[pilot_uid] = {
            'data':          pilot_json,
            'callbacks':     [],
            'facade_object': weakref.ref(pilot)
        }

        return pilot_uid
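
To illustrate what the code above computes, here is the resulting sandbox layout for a hypothetical endpoint and working directory; all values are placeholders.

# Illustrative result of register_start_pilot_request(); all values are placeholders.
#
#   filesystem_endpoint : sftp://cluster.example.org/
#   expanded workdir    : /home/user            (from 'default_remote_workdir' or $PWD)
#   global sandbox      : /home/user/radical.pilot.sandbox
#   pilot sandbox       : sftp://cluster.example.org/home/user/radical.pilot.sandbox/<session_uid>-<pilot_uid>/
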
    def stage_in(self, directives):
        """Stages the content of the staging directive into the pilot's
        staging area"""

        # Wait until we can assume the pilot directory to be created
        if self.state == NEW:
            self.wait(state=[PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE, ACTIVE])
        elif self.state in [DONE, FAILED, CANCELED]:
            raise Exception("Pilot already finished, no need to stage anymore!")

        # Iterate over all directives
        for directive in expand_staging_directive(directives, logger):

            # TODO: respect flags in directive

            src_url = saga.Url(directive['source'])
            action = directive['action']

            # Convert the target url into a SAGA Url object
            tgt_url = saga.Url(directive['target'])
            # Create a pointer to the directory object that we will use
            tgt_dir_url = tgt_url

            if tgt_url.path.endswith('/'):
                # If the original target was a directory (ends with /),
                # we assume that the user wants the same filename as the source.
                tgt_filename = os.path.basename(src_url.path)
            else:
                # Otherwise, extract the filename and update the directory
                tgt_filename = os.path.basename(tgt_dir_url.path)
                tgt_dir_url.path = os.path.dirname(tgt_dir_url.path)

            # Handle special 'staging' scheme
            if tgt_dir_url.scheme == 'staging':

                # We expect a staging:///relative/path/file.txt URI,
                # as hostname would have unclear semantics currently.
                if tgt_dir_url.host:
                    raise Exception("hostname not supported with staging:// scheme")

                # Remove the leading slash to get a relative path from the staging area
                target = os.path.relpath(tgt_dir_url.path, '/')

                # Now base the target directory relative of the sandbox and staging prefix
                tgt_dir_url = saga.Url(os.path.join(self.sandbox, STAGING_AREA, target))

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.
            target_dir = saga.filesystem.Directory(tgt_dir_url, flags=saga.filesystem.CREATE_PARENTS)

            if action == LINK:
                # TODO: Does this make sense?
                #log_message = 'Linking %s to %s' % (source, abs_target)
                #os.symlink(source, abs_target)
                logger.error("action 'LINK' not supported on pilot level staging")
                raise ValueError("action 'LINK' not supported on pilot level staging")
            elif action == COPY:
                # TODO: Does this make sense?
                #log_message = 'Copying %s to %s' % (source, abs_target)
                #shutil.copyfile(source, abs_target)
                logger.error("action 'COPY' not supported on pilot level staging")
                raise ValueError("action 'COPY' not supported on pilot level staging")
            elif action == MOVE:
                # TODO: Does this make sense?
                #log_message = 'Moving %s to %s' % (source, abs_target)
                #shutil.move(source, abs_target)
                logger.error("action 'MOVE' not supported on pilot level staging")
                raise ValueError("action 'MOVE' not supported on pilot level staging")
            elif action == TRANSFER:
                log_message = 'Transferring %s to %s' % (src_url, os.path.join(str(tgt_dir_url), tgt_filename))
                logger.info(log_message)
                # Transfer the source file to the target staging area
                target_dir.copy(src_url, tgt_filename)
            else:
                raise Exception('Action %s not supported' % action)
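
A minimal staging directive of the shape stage_in() consumes, based on the keys and the staging:// handling above; the source and target values are placeholders, and the stage_in() call is commented out because the surrounding objects are assumed.

# Placeholder values; keys follow the lookups in stage_in() above.  TRANSFER is the
# module-level action constant used there (the only action accepted at pilot level).
directive = {
    'source': 'file://localhost/tmp/input.dat',
    'action': TRANSFER,
    'target': 'staging:///input/',   # trailing '/': keep the source filename
}
# pilot.stage_in(directive)          # 'pilot' assumed to be an active ComputePilot
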
    def handle_schedule(self, schedule):

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.

        if not schedule:
            logger.debug('skipping empty unit schedule')
            return

        # print 'handle schedule:'
        # import pprint
        # pprint.pprint (schedule)
        #
        pilot_cu_map = dict()
        unscheduled = list()

        pilot_ids = self.list_pilots()

        for unit in schedule['units'].keys():

            pid = schedule['units'][unit]

            if pid is None:
                unscheduled.append(unit)
                continue

            else:

                if pid not in pilot_ids:
                    raise RuntimeError("schedule points to unknown pilot %s" %
                                       pid)

                if pid not in pilot_cu_map:
                    pilot_cu_map[pid] = list()

                pilot_cu_map[pid].append(unit)

        # submit to all pilots which got something submitted to
        for pid in pilot_cu_map.keys():

            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in pilot_cu_map[pid]:

                if pid not in schedule['pilots']:
                    # lost pilot, do not schedule unit
                    logger.warn("unschedule unit %s, lost pilot %s" %
                                (unit.uid, pid))
                    continue

                unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(
                    unit.uid)

                ud = unit.description

                if 'kernel' in ud and ud['kernel']:

                    try:
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception as ex:
                        logger.error ("Kernels are not supported in" \
                              "compute unit descriptions -- install " \
                              "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state(
                            unit._uid, FAILED, ["kernel expansion failed"])
                        continue

                    pilot_resource = schedule['pilots'][pid]['resource']

                    mdtd = MDTaskDescription()
                    mdtd.kernel = ud.kernel
                    mdtd_bound = mdtd.bind(resource=pilot_resource)
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec = mdtd_bound.pre_exec
                    ud.executable = mdtd_bound.executable
                    ud.mpi = mdtd_bound.mpi

                units_to_schedule.append(unit)

            if units_to_schedule:
                self._worker.schedule_compute_units(pilot_uid=pid,
                                                    units=units_to_schedule)

        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size

        self.wait_queue_size = len(unscheduled)
        if old_wait_queue_size != self.wait_queue_size:
            self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                               self.wait_queue_size)

        if unscheduled:
            self._worker.unschedule_compute_units(units=unscheduled)

        logger.info('%s units remain unscheduled' % len(unscheduled))
Example no. 10
    def __init__(self,
                 database_url=None,
                 database_name="radicalpilot",
                 uid=None,
                 name=None):
        """Creates a new or reconnects to an exising session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        # init the base class inits
        saga.Session.__init__(self)
        Object.__init__(self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url = database_url
        self._database_name = database_name

        if not self._database_url:
            self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not self._database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url(self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else:
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected = None

                if name:
                    self._name = name
                    self._uid = name
                # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else:
                    self._uid = ru.generate_id('rp.session',
                                               mode=ru.ID_PRIVATE)
                    self._name = self._uid


                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
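
A few illustrative ways to construct a Session, reflecting the URL handling above; the host names and database names are placeholders.

# Placeholders only; reflects the database-URL handling in __init__ above.
#
#   s = Session(database_url='mongodb://db.example.org:27017/')       # uses the default db name 'radicalpilot'
#   s = Session(database_url='mongodb://db.example.org:27017/my_db')  # path element is taken as the db name
#   s = Session()                                                     # falls back to $RADICAL_PILOT_DBURL
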
Example no. 11
    def close(self, terminate=True):
        """Shuts down the PilotManager and its background workers in a 
        coordinated fashion.

        **Arguments:**

            * **terminate** (`bool`): If set to True, all active pilots will
              be canceled (default: True).

        """

        logger.debug("pmgr    %s closing" % (str(self._uid)))

        # Spit out a warning in case the object was already closed.
        if not self._uid:
            logger.error("PilotManager object already closed.")
            return

        # before we terminate pilots, we have to kill the pilot launcher threads
        # -- otherwise we'll run into continuous race conditions due to the
        # ongoing state checks...
        if self._worker is not None:
            # Stop the worker process
            logger.debug("pmgr    %s cancel   worker %s" % (str(self._uid), self._worker.name))
            self._worker.cancel_launcher()
            logger.debug("pmgr    %s canceled worker %s" % (str(self._uid), self._worker.name))



        # If terminate is set, we cancel all pilots. 
        if  terminate :
            # cancel all pilots, make sure they are gone, and close the pilot
            # managers.
            for pilot in self.get_pilots () :
                logger.debug("pmgr    %s cancels  pilot  %s" % (str(self._uid), pilot._uid))
            self.cancel_pilots ()

          # FIXME:
          #
          # wait_pilots() will wait until all pilots picked up the sent cancel
          # signal and died.  However, that can take a long time.  For
          # example, if a pilot is in 'PENDING_ACTIVE' state, this will have to
          # wait until the pilot is bootstrapped, started, connected to the DB,
          # and shut down again.  Or, for a pilot which just received a large
          # batch of units, it will have to wait until the pilot started all those units
          # and then checks its command queue again.  Or, if the pilot job
          # already died, wait will block until the state checker kicks in and
          # declares the pilot as dead, which takes a couple of minutes.
          #
          # A solution would be to add a CANCELING state and to wait for that one,
          # too, which basically means to wait until the cancel signal has been
          # sent.  There is not much more to do at this point anyway.  This is at
          # the moment faked in the manager controller, which sets that state
          # after sending the cancel command.  This should be converted into
          # a proper state -- that would, btw, remove the need for a cancel
          # command in the first place, as the pilot can just pull its own state
          # instead, and cancel on CANCELING...
          #
          # self.wait_pilots ()
            wait_for_cancel = True
            all_pilots = self.get_pilots ()
            while wait_for_cancel :
                wait_for_cancel = False
                for pilot in all_pilots :
                    logger.debug("pmgr    %s wait for pilot  %s (%s)" % (str(self._uid), pilot._uid, pilot.state))
                    if  pilot.state not in [DONE, FAILED, CANCELED, CANCELING] :
                        time.sleep (1)
                        wait_for_cancel = True
                        break
            for pilot in self.get_pilots () :
                logger.debug("pmgr    %s canceled pilot  %s" % (str(self._uid), pilot._uid))


        logger.debug("pmgr    %s stops    worker %s" % (str(self._uid), self._worker.name))
        self._worker.stop()
        self._worker.join()
        logger.debug("pmgr    %s stopped  worker %s" % (str(self._uid), self._worker.name))

        # Remove worker from registry
        self._session._process_registry.remove(self._uid)


        logger.debug("pmgr    %s closed" % (str(self._uid)))
        self._uid = None
Example no. 12
    def _unit_state_callback (self, unit, state) :
        
        try :

            with self.lock :
            
                uid = unit.uid

                logger.info ("[SchedulerCallback]: Computeunit %s changed to %s" % (uid, state))


                found_unit = False
                if  state in [NEW, UNSCHEDULED] :

                    for pid in self.runqs :

                        if  not pid :
                            logger.warning ('cannot handle final unit %s w/o pilot information' % uid)

                        if  uid in self.runqs[pid] :

                            logger.info ('reschedule NEW unit %s from %s' % (uid, pid))

                            unit       = self.runqs[pid][uid]
                            found_unit = True

                            del self.runqs[pid][uid]
                            self.waitq[uid] = unit

                          # self._dump ('before reschedule %s' % uid)
                            self._reschedule (uid=uid)
                          # self._dump ('after  reschedule %s' % uid)

                            return

              # if  not found_unit and uid not in self.waitq :
              #     # as we cannot unregister callbacks, we simply ignore this
              #     # invocation.  It is probably from a unit we handled previously.
              #     # (although this should have been final?)
              #     #
              #     # FIXME: how can I *un*register a unit callback?
              #     logger.error ("[SchedulerCallback]: cannot handle unit %s" % uid)
              #     self._dump()
              #     return

                if  state in [PENDING_OUTPUT_STAGING, STAGING_OUTPUT, DONE, FAILED, CANCELED] :
                    # the pilot which owned this CU should now have free slots available
                    # FIXME: how do I get the pilot from the CU?
                    
                    pid = unit.execution_details.get ('pilot', None)

                    if  not pid :
                        raise RuntimeError ('cannot handle final unit %s w/o pilot information' % uid)

                    if  pid not in self.pilots :
                        logger.warning ('cannot handle unit %s cb for pilot %s (pilot is gone)' % (uid, pid))

                    else :
                        if  uid in self.runqs[pid] :

                            unit = self.runqs[pid][uid]

                            del self.runqs[pid][uid]
                            self.pilots[pid]['caps'] += unit.description.cores
                            self._reschedule (target_pid=pid)
                            found_unit = True

                      #     logger.debug ('unit %s frees %s cores on (-> %s)' \
                      #                % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))

                    if not found_unit :
                        logger.warn ('unit %s freed %s cores on %s (== %s) -- not reused'
                                  % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))


        except Exception as e :
            logger.error ("error in unit callback for backfiller (%s) - ignored" % e)
Example no. 13
    def __init__ (self, database_url=None, database_name="radicalpilot",
                  uid=None, name=None):
        """Creates a new or reconnects to an exising session.

        If called without a uid, a new Session instance is created and 
        stored in the database. If uid is set, an existing session is 
        retrieved from the database. 

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name 
              (default: 'radicalpilot').

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * :class:`radical.pilot.DatabaseError`

        """

        # init the base class inits
        saga.Session.__init__ (self)
        Object.__init__ (self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper ()

        # Dictionaries holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this 
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in 
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url  = database_url
        self._database_name = database_name 

        if  not self._database_url :
            self._database_url = os.getenv ("RADICAL_PILOT_DBURL", None)

        if  not self._database_url :
            raise PilotException ("no database URL (set RADICAL_PILOT_DBURL)")  

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url (self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else :
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations
        module_path   = os.path.dirname(os.path.abspath(__file__))
        default_cfgs  = "%s/configs/*.json" % module_path
        config_files  = glob.glob(default_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict() 

        user_cfgs     = "%s/.radical/pilot/configs/*.json" % os.environ.get ('HOME')
        config_files  = glob.glob(user_cfgs)

        for config_file in config_files:

            try :
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e :
                logger.error ("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if  rc in self._resource_configs :
                    # config exists -- merge user config into it
                    ru.dict_merge (self._resource_configs[rc],
                                   rcs[rc].as_dict(),
                                   policy='overwrite')
                else :
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict() 

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str (default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected  = None

                if name :
                    self._name = name
                    self._uid  = name
                  # self._uid  = ru.generate_id ('rp.session.'+name+'.%(item_counter)06d', mode=ru.ID_CUSTOM)
                else :
                    self._uid  = ru.generate_id ('rp.session', mode=ru.ID_PRIVATE)
                    self._name = self._uid


                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception ('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))