Example #1
0
    def get_compute_pilot_data(self, pilot_ids=None):
        """Returns the raw data (json dicts) of one or more ComputePilots
           registered with this Worker / PilotManager.

        **Arguments:**

            * **pilot_ids** (`string` or `list` of `string`): ID(s) of the
              pilot(s) to look up.  If `None`, the data of all pilots
              currently held in the shared data store are returned.

        **Returns:**

            * A list of data dicts if `pilot_ids` is a list or `None`;
              otherwise the single data dict of the requested pilot.

        **Raises:**

            * `KeyError` if a pilot ID is unknown or its entry has no
              'data' element.  The error is logged before it is re-raised.
        """
        # Wait for the initialized event to assert proper operation.
        self._initialized.wait()

        if pilot_ids is None:
            # Materialize the keys: on Python 3 `keys()` is a view, which
            # would fail the list check below and be treated as one ID.
            pilot_ids = list(self._shared_data.keys())

        return_list_type = isinstance(pilot_ids, list)
        if not return_list_type:
            pilot_ids = [pilot_ids]

        data = list()
        for pilot_id in pilot_ids:
            try:
                data.append(self._shared_data[pilot_id]['data'])
            except KeyError as e:
                logger.exception("Unknown Pilot ID %s : %s" % (pilot_id, e))
                raise

        if return_list_type:
            return data
        return data[0]
Example #2
0
    def call_unit_state_callbacks(self, unit_id, new_state):
        """Invoke all callbacks interested in a state change of a unit.

        Unit-level callbacks are called first, followed by the manager-level
        callbacks registered for UNIT_STATE.  Every callback receives the
        unit's facade object and the new state, plus its registered callback
        data if that data is truthy.  An exception raised by a callback is
        logged and re-raised.
        """

        # This is the earliest point at which the application could have
        # been notified about the state change, so the event is recorded.
        if unit_id not in self._callback_histories:
            self._callback_histories[unit_id] = []
        event = {'timestamp': datetime.datetime.utcnow(),
                 'state': new_state}
        self._callback_histories[unit_id].append(event)

        # unit-level callbacks
        for callback, callback_data in self._shared_data[unit_id]['callbacks']:
            try:
                facade = self._shared_data[unit_id]['facade_object']
                if not facade:
                    logger.error("Couldn't call callback (no pilot instance)")
                elif callback_data:
                    callback(facade, new_state, callback_data)
                else:
                    callback(facade, new_state)
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # manager-level callbacks, if any are registered
        if UNIT_STATE not in self._manager_callbacks:
            self._manager_callbacks[UNIT_STATE] = []

        for callback, callback_data in self._manager_callbacks[UNIT_STATE]:
            # without a facade object there is nothing to hand to the
            # callbacks, so the remaining ones are skipped as well
            if not self._shared_data[unit_id]['facade_object']:
                logger.warning('skip cb for incomple unit (%s: %s)' %
                               (unit_id, new_state))
                break

            try:
                if callback_data:
                    extra_args = (callback_data,)
                else:
                    extra_args = ()
                callback(self._shared_data[unit_id]['facade_object'],
                         new_state, *extra_args)
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # On a final state, publish the collected callback history for
        # later evaluation.
        reached_final_state = new_state in (DONE, FAILED, CANCELED)
        if reached_final_state:
            self._db.publish_compute_unit_callback_history(
                unit_id, self._callback_histories[unit_id])
    def unschedule_compute_units(self, units):
        """Set the state of the given units to UNSCHEDULED.

        **Arguments:**

            * **units**: iterable of unit objects; the `uid` attribute of
              each one is collected and passed to the database layer.

        **Raises:**

            * Any exception raised while collecting the IDs or updating the
              database is logged and re-raised unchanged.
        """

        try:
            unit_ids = [unit.uid for unit in units]
            self._db.set_compute_unit_state(unit_ids, UNSCHEDULED,
                                            "unit remains unscheduled")

        except Exception:
            logger.exception('error in unit manager controller (unschedule())')
            raise
    def call_unit_state_callbacks(self, unit_id, new_state):
        """Call all relevant callbacks for a unit state change, on unit-level
        as well as on manager-level.

        Callbacks are invoked with the unit's facade object and the new
        state; truthy callback data is passed as a third argument.  A failing
        callback is logged and its exception is re-raised.
        """

        def _invoke(callback, callback_data):
            # hand the callback data over only if there is any
            facade = self._shared_data[unit_id]['facade_object']
            if callback_data:
                callback(facade, new_state, callback_data)
            else:
                callback(facade, new_state)

        # The application could, at the earliest, have been notified about
        # the state change right now -- record that event.
        if unit_id not in self._callback_histories:
            self._callback_histories[unit_id] = list()
        self._callback_histories[unit_id].append(
            {'timestamp': datetime.datetime.utcnow(),
             'state': new_state})

        # unit-level callbacks
        for callback, callback_data in self._shared_data[unit_id]['callbacks']:
            try:
                if self._shared_data[unit_id]['facade_object']:
                    _invoke(callback, callback_data)
                else:
                    logger.error("Couldn't call callback (no pilot instance)")
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # manager-level callbacks
        if UNIT_STATE not in self._manager_callbacks:
            self._manager_callbacks[UNIT_STATE] = list()

        for callback, callback_data in self._manager_callbacks[UNIT_STATE]:
            if not self._shared_data[unit_id]['facade_object']:
                # no facade object to pass on: give up on all remaining
                # manager callbacks
                logger.warning('skip cb for incomple unit (%s: %s)'
                               % (unit_id, new_state))
                break

            try:
                _invoke(callback, callback_data)
            except Exception as e:
                logger.exception("Couldn't call callback function %s" % e)
                raise

        # A final state triggers the publication of the callback history,
        # which is evaluated later on.
        if new_state in (DONE, FAILED, CANCELED):
            history = self._callback_histories[unit_id]
            self._db.publish_compute_unit_callback_history(unit_id, history)
Example #5
0
    def unschedule_compute_units(self, units):
        """Set the unit state to UNSCHEDULED for all given units.

        **Arguments:**

            * **units**: iterable of unit objects which expose a `uid`
              attribute.

        **Raises:**

            * Exceptions from reading the unit IDs or from the database
              update are logged and then re-raised.
        """

        try:
            unit_ids = [unit.uid for unit in units]
            self._db.set_compute_unit_state(unit_ids, UNSCHEDULED,
                                            "unit remains unscheduled")

        except Exception:
            # log with traceback, but leave error handling to the caller
            logger.exception('error in unit manager controller (unschedule())')
            raise
    def fire_manager_callback(self, metric, obj, value):
        """Invoke all manager-level callbacks registered for `metric`.

        Each callback is called as `cb(obj, value)`, or as
        `cb(obj, value, cb_data)` if truthy callback data was registered
        along with it.  For a metric without registered callbacks, an empty
        callback list is created.  A failing callback is logged and its
        exception is re-raised, so later callbacks are not invoked.
        """
        if metric not in self._manager_callbacks:
            self._manager_callbacks[metric] = []

        for callback, callback_data in self._manager_callbacks[metric]:
            try:
                if callback_data:
                    call_args = (obj, value, callback_data)
                else:
                    call_args = (obj, value)
                callback(*call_args)
            except Exception as e:
                logger.exception("Couldn't call '%s' callback function %s: %s"
                                 % (metric, callback, e))
                raise
Example #7
0
    def fire_manager_callback(self, metric, obj, value):
        """Fire the manager-level callbacks of a metric.

        Looks up the callbacks registered for `metric` (registering an empty
        list if there are none yet) and calls them in order with `obj` and
        `value`.  Callback data is appended as third argument when it is
        truthy.  The first callback which raises ends the loop: the error is
        logged and re-raised.
        """
        registered = self._manager_callbacks.setdefault(metric, list())

        for entry in registered:
            handler, handler_data = entry
            try:
                if not handler_data:
                    handler(obj, value)
                else:
                    handler(obj, value, handler_data)
            except Exception as e:
                message = "Couldn't call '%s' callback function %s: %s"
                logger.exception(message % (metric, handler, e))
                raise
    def submit_units(self, unit_descriptions):
        """Submits one or more :class:`radical.pilot.ComputeUnit` instances to
        the unit manager.

        The units are created in SCHEDULING state, published via the worker,
        passed to the scheduler, and the resulting schedule is handed to
        `handle_schedule()`.

        **Arguments:**

            * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` objects if a
                list of descriptions was passed, otherwise a single
                :class:`radical.pilot.ComputeUnit` object.

        **Raises:**

            * `IncorrectState` if this manager instance has no uid.
            * Any exception raised by the scheduler is logged and re-raised.
        """

        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        # remember the input type, so that a single description yields a
        # single unit
        return_list_type = isinstance(unit_descriptions, list)
        if not return_list_type:
            unit_descriptions = [unit_descriptions]

        units = [ComputeUnit.create(unit_description=ud,
                                    unit_manager_obj=self,
                                    local_state=SCHEDULING)
                 for ud in unit_descriptions]

        self._worker.publish_compute_units(units=units)

        # According to the original notes, the scheduler returns a dictionary
        # of the form:
        #   {
        #     ud_1 : pilot_id_a,
        #     ud_2 : pilot_id_b
        #     ...
        #   }
        #
        # The scheduler may not be able to schedule some units - those will
        # have 'None' as pilot ID.
        try:
            schedule = self._scheduler.schedule(units=units)

        except Exception:
            logger.exception("Internal error - unit scheduler failed")
            raise

        self.handle_schedule(schedule)

        if return_list_type:
            return units
        return units[0]
Example #9
0
    def __init__(self,
                 database_url=None,
                 database_name="radicalpilot",
                 uid=None,
                 name=None):
        """Creates a new or reconnects to an existing session.

        If called without a uid, a new Session instance is created and
        stored in the database. If uid is set, an existing session is
        meant to be retrieved from the database.

        NOTE(review): only the 'create' path (`uid is None`) is implemented
        in this block; no reconnect logic is visible here.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name
              (default: 'radicalpilot').  A path element in the database URL
              takes precedence for `self._database_name`.

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name.  If given,
              it is also used as the uid of a newly created session.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * `PilotException` if no database URL is available, or if the
              new session cannot be created.

        """

        # init the base class inits
        saga.Session.__init__(self)
        Object.__init__(self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Lists holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url = database_url
        self._database_name = database_name

        # fall back to the environment if no URL was passed
        if not self._database_url:
            self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not self._database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url(self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else:
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations which are shipped
        # next to this module
        module_path = os.path.dirname(os.path.abspath(__file__))
        default_cfgs = "%s/configs/*.json" % module_path
        config_files = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                # a broken config file must not prevent session creation
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # Loading all user-defined resource configurations.  Those are merged
        # into (and overwrite) default configurations with the same key.
        user_cfgs = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
        config_files = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected = None

                if name:
                    # a user provided name doubles as session uid
                    self._name = name
                    self._uid = name
                else:
                    self._uid = ru.generate_id('rp.session',
                                               mode=ru.ID_PRIVATE)
                    self._name = self._uid

                # NOTE(review): this passes the `database_name` argument, not
                # `self._database_name` (which may have been derived from the
                # URL path above) -- confirm that this is intended.
                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
Example #10
0
    def _pilot_state_callback (self, pilot, state) :
        """Handle a state change notification for `pilot`.

        Notifications for pilots which are not in `self.pilots` are logged
        and ignored.  For known pilots the new state is stored; an ACTIVE
        pilot triggers a reschedule targeting that pilot.  For a pilot in a
        final state (DONE, FAILED, CANCELED), the units assigned to it are
        updated in the database and the pilot is removed from `self.pilots`.

        Any exception is logged and re-raised.
        """

        try :

            with self.lock :

                pid = pilot.uid

                if  pid not in self.pilots :
                    # Callbacks cannot be unregistered, so notifications may
                    # still arrive for pilots which were used previously --
                    # those are ignored.
                    logger.warn ("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)" % (pid, state))
                    return

                self.pilots[pid]['state'] = state
                logger.debug ("[SchedulerCallback]: ComputePilot %s changed to %s" % (pid, state))

                if  state == ACTIVE :
                    # the pilot is now ready to be used
                    self._reschedule (target_pid=pid)

                if  state not in (DONE, FAILED, CANCELED) :
                    # nothing else to do for non-final states
                    return

                # The pilot is final: units which did not start executing,
                # and restartable units which did, go back to UNSCHEDULED;
                # units which started executing but are not restartable are
                # marked FAILED.
                timestamp = datetime.datetime.utcnow()

                def _history_push (unit_state) :
                    # state history and log entries share one timestamp
                    return {"statehistory": {"state"     : unit_state,
                                             "timestamp" : timestamp},
                            "log"         : {"message"   :  "reschedule unit",
                                             "timestamp" : timestamp}}

                def _started_states () :
                    # states of units which began executing on the pilot
                    return [EXECUTING,
                            PENDING_OUTPUT_STAGING,
                            STAGING_OUTPUT]

                # units which never started
                self._db.change_compute_units (
                    filter_dict = {"pilot" : pid,
                                   "state" : {"$in": [UNSCHEDULED,
                                                      PENDING_INPUT_STAGING,
                                                      STAGING_INPUT,
                                                      PENDING_EXECUTION,
                                                      SCHEDULING]}},
                    set_dict    = {"state" : UNSCHEDULED,
                                   "pilot" : None},
                    push_dict   = _history_push (UNSCHEDULED))

                # started units which can be restarted
                self._db.change_compute_units (
                    filter_dict = {"pilot"       : pid,
                                   "restartable" : True,
                                   "state"       : {"$in": _started_states ()}},
                    set_dict    = {"state" : UNSCHEDULED,
                                   "pilot" : None},
                    push_dict   = _history_push (UNSCHEDULED))

                # started units which cannot be restarted
                self._db.change_compute_units (
                    filter_dict = {"pilot"       : pid,
                                   "restartable" : False,
                                   "state"       : {"$in": _started_states ()}},
                    set_dict    = {"state" : FAILED},
                    push_dict   = _history_push (FAILED))

                # FIXME AM: restartable units should get back into the wait
                # queue, but the unit states are not available here: the new
                # states were just pushed to the DB for an unknown set of
                # units, and the state cached by the worker is most likely
                # outdated.  So runq/waitq are not handled here -- instead,
                # the unit callback is relied upon to fire once the state
                # propagated back, and to remove the units from the runq.
                # That is potentially very slow, but safe.

                # this pilot cannot be used anymore
                # FIXME: how can a pilot callback be *un*registered?
                del self.pilots[pid]

        except Exception as e :
            logger.exception ("error in pilot callback for backfiller (%s) - ignored" % e)
            raise
Example #11
0
    def __init__ (self, database_url=None, database_name="radicalpilot",
                  uid=None, name=None):
        """Creates a new or reconnects to an existing session.

        If called without a uid, a new Session instance is created and
        stored in the database. If uid is set, an existing session is
        meant to be retrieved from the database.

        NOTE(review): this block only contains the code path for
        `uid is None`; reconnecting to an existing session is not visible
        here.

        **Arguments:**
            * **database_url** (`string`): The MongoDB URL.  If none is given,
              RP uses the environment variable RADICAL_PILOT_DBURL.  If that is
              not set, an error will be raised.

            * **database_name** (`string`): An alternative database name
              (default: 'radicalpilot').  If the database URL has a path
              element, that path is stored as `self._database_name` instead.

            * **uid** (`string`): If uid is set, we try to
              re-connect to an existing session instead of creating a new one.

            * **name** (`string`): An optional human readable name, which is
              then also used as uid of the new session.

        **Returns:**
            * A new Session instance.

        **Raises:**
            * `PilotException` if neither the argument nor the environment
              provide a database URL, or if creating the session fails.

        """

        # init the base class inits
        saga.Session.__init__(self)
        Object.__init__(self)

        # before doing anything else, set up the debug helper for the lifetime
        # of the session.
        self._debug_helper = ru.DebugHelper()

        # Lists holding all manager objects created during the session.
        self._pilot_manager_objects = list()
        self._unit_manager_objects = list()

        # Create a new process registry. All objects belonging to this
        # session will register their worker processes (if they have any)
        # in this registry. This makes it easier to shut down things in
        # a more coordinated fashion.
        self._process_registry = _ProcessRegistry()

        # The resource configuration dictionary associated with the session.
        self._resource_configs = {}

        self._database_url  = database_url
        self._database_name = database_name

        # the environment is only consulted if no URL was passed explicitly
        if not self._database_url:
            self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

        if not self._database_url:
            raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

        logger.info("using database url  %s" % self._database_url)

        # if the database url contains a path element, we interpret that as
        # database name (without the leading slash)
        tmp_url = ru.Url(self._database_url)
        if  tmp_url.path            and \
            tmp_url.path[0]  == '/' and \
            len(tmp_url.path) >  1  :
            self._database_name = tmp_url.path[1:]
            logger.info("using database path %s" % self._database_name)
        else:
            logger.info("using database name %s" % self._database_name)

        # Loading all "default" resource configurations from the `configs`
        # directory next to this module
        module_path   = os.path.dirname(os.path.abspath(__file__))
        default_cfgs  = "%s/configs/*.json" % module_path
        config_files  = glob.glob(default_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                # unreadable config files are reported and skipped
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)
                self._resource_configs[rc] = rcs[rc].as_dict()

        # Loading the user's resource configurations; entries which already
        # exist are merged, with the user settings overwriting the defaults
        user_cfgs     = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
        config_files  = glob.glob(user_cfgs)

        for config_file in config_files:

            try:
                rcs = ResourceConfig.from_file(config_file)
            except Exception as e:
                logger.error("skip config file %s: %s" % (config_file, e))
                continue

            for rc in rcs:
                logger.info("Loaded resource configurations for %s" % rc)

                if rc in self._resource_configs:
                    # config exists -- merge user config into it
                    ru.dict_merge(self._resource_configs[rc],
                                  rcs[rc].as_dict(),
                                  policy='overwrite')
                else:
                    # new config -- add as is
                    self._resource_configs[rc] = rcs[rc].as_dict()

        default_aliases = "%s/configs/aliases.json" % module_path
        self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

        ##########################
        ## CREATE A NEW SESSION ##
        ##########################
        if uid is None:
            try:
                self._connected  = None

                if name:
                    # the given name is used as session uid, too
                    self._name = name
                    self._uid  = name
                else:
                    self._uid  = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                    self._name = self._uid

                # NOTE(review): `db_name` receives the `database_name`
                # argument rather than `self._database_name`, which may have
                # been replaced by the URL path above -- confirm intent.
                self._dbs, self._created, self._connection_info = \
                        dbSession.new(sid     = self._uid,
                                      name    = self._name,
                                      db_url  = self._database_url,
                                      db_name = database_name)

                logger.info("New Session created%s." % str(self))

            except Exception as ex:
                logger.exception('session create failed')
                raise PilotException("Couldn't create new session (database URL '%s' incorrect?): %s" \
                                % (self._database_url, ex))
    def schedule_compute_units(self, pilot_uid, units):
        """Request the scheduling of one or more ComputeUnits on a
           ComputePilot.

        For every unit, the input and output staging directives of its
        description are split into directives handled by the file transfer
        worker (`FTW_*` attributes) and directives handled by the agent
        (`Agent_*` attributes).  Units with input staging directives are set
        to PENDING_INPUT_STAGING, all others to PENDING_EXECUTION.  All units
        are assigned to the pilot in the database.

        **Arguments:**

            * **pilot_uid**: ID of the pilot the units are scheduled on.

            * **units**: iterable of unit objects; `uid`,
              `description.input_staging` and `description.output_staging`
              are read, and the staging tracking attributes are set on them.

        **Raises:**

            * Any exception is logged and re-raised unchanged.
        """

        try:
            cu_transfer   = list()
            cu_notransfer = list()

            # Get some information about the pilot sandbox from the database.
            pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
            # TODO: this hack below relies on what?! That there is just one pilot?
            pilot_sandbox = pilot_info[0]['sandbox']

            # Split units into two different lists: the first list contains the CUs
            # that need file transfer and the second list contains the CUs that
            # don't. The latter is added to the pilot directly, while the former
            # is added to the transfer queue.
            for unit in units:

                # Create object for staging status tracking
                unit.FTW_Input_Status = None
                unit.FTW_Input_Directives = []
                unit.Agent_Input_Status = None
                unit.Agent_Input_Directives = []
                unit.FTW_Output_Status = None
                unit.FTW_Output_Directives = []
                unit.Agent_Output_Status = None
                unit.Agent_Output_Directives = []

                # Split the input staging directives over the transfer worker and the agent
                input_sds = unit.description.input_staging
                if not isinstance(input_sds, list):
                    # Ugly, but is a workaround for iterating on attribute interface
                    # TODO: Verify if this piece of code is actually still required
                    if input_sds:
                        input_sds = [input_sds]
                    else:
                        input_sds = []

                for input_sd_entry in input_sds:
                    action = input_sd_entry['action']
                    source = Url(input_sd_entry['source'])
                    target = Url(input_sd_entry['target'])

                    new_sd = {'action':   action,
                              'source':   str(source),
                              'target':   str(target),
                              'flags':    input_sd_entry['flags'],
                              'priority': input_sd_entry['priority'],
                              'state':    PENDING
                    }

                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    elif action in [TRANSFER]:
                        if source.scheme and source.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote pull from the agent
                            unit.Agent_Input_Directives.append(new_sd)
                            unit.Agent_Input_Status = PENDING
                        else:
                            # Transfer from local to sandbox
                            unit.FTW_Input_Directives.append(new_sd)
                            unit.FTW_Input_Status = PENDING
                    else:
                        # the directive is dropped, not staged
                        logger.warn('Not sure if action %s makes sense for input staging' % action)

                # Split the output staging directives over the transfer worker and the agent
                output_sds = unit.description.output_staging
                if not isinstance(output_sds, list):
                    # Ugly, but is a workaround for iterating on att iface
                    # TODO: Verify if this piece of code is actually still required
                    if output_sds:
                        output_sds = [output_sds]
                    else:
                        output_sds = []

                for output_sds_entry in output_sds:
                    action = output_sds_entry['action']
                    source = Url(output_sds_entry['source'])
                    target = Url(output_sds_entry['target'])

                    new_sd = {'action':   action,
                              'source':   str(source),
                              'target':   str(target),
                              'flags':    output_sds_entry['flags'],
                              'priority': output_sds_entry['priority'],
                              'state':    PENDING
                    }

                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    elif action in [TRANSFER]:
                        if target.scheme and target.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote push from the agent
                            unit.Agent_Output_Directives.append(new_sd)
                            unit.Agent_Output_Status = NEW
                        else:
                            # Transfer from sandbox back to local
                            unit.FTW_Output_Directives.append(new_sd)
                            unit.FTW_Output_Status = NEW
                    else:
                        # the directive is dropped, not staged
                        logger.warn('Not sure if action %s makes sense for output staging' % action)

                # Only input staging decides whether a unit needs to wait for
                # data transfer before it can be executed.
                if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                    log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                    self._db.set_compute_unit_state(unit.uid, PENDING_INPUT_STAGING, log)
                    cu_transfer.append(unit)
                else:
                    cu_notransfer.append(unit)

            # Bulk-add all non-transfer units, then all transfer units.
            self._db.assign_compute_units_to_pilot(
                units=cu_notransfer,
                pilot_uid=pilot_uid,
                pilot_sandbox=pilot_sandbox
            )

            self._db.assign_compute_units_to_pilot(
                units=cu_transfer,
                pilot_uid=pilot_uid,
                pilot_sandbox=pilot_sandbox
            )

            for unit in cu_notransfer:
                log = "Scheduled for execution on ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)

            logger.info(
                "Scheduled ComputeUnits %s for execution on ComputePilot '%s'." %
                (cu_notransfer, pilot_uid)
            )
        except Exception:
            logger.exception ('error in unit manager controller (schedule())')
            raise
    def run(self):
        """Starts the process when Process.start() is called.

        Connects to MongoDB and then, until ``self._stop`` is set, repeatedly:

          1. claims a ComputeUnit of this unit manager whose
             ``FTW_Input_Status`` is PENDING, marking it EXECUTING and moving
             the unit to STAGING_INPUT;
          2. creates the unit's sandbox directory and copies every entry of
             its ``FTW_Input_Directives`` to the sandbox via SAGA, marking a
             directive 'Done' only after its copy succeeded;
          3. (workers with ``_worker_number`` <= 1 only) marks the FTW / Agent
             input staging status DONE for units that have no pending or
             executing directives left, and advances a unit whose staging is
             complete from STAGING_INPUT to PENDING_EXECUTION.

        A failing transfer is logged and re-raised, so that the unit ends up
        FAILED rather than having the directive recorded as 'Done'.

        Any exception escaping the main loop closes the SAGA session and is
        re-raised.  SystemExit is translated into an interrupt of the main
        thread.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.info("Starting InputFileTransferWorker")

            # Connect to the database and get a handle on the session's
            # ComputeUnit collection.
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e :
                logger.exception("Connection error: %s" % e)
                raise

            try :
                while not self._stop.is_set():
                    # See if we can find a ComputeUnit that is waiting for
                    # input file transfer.  The unit is claimed atomically by
                    # flipping its FTW status from PENDING to EXECUTING.
                    ts = datetime.datetime.utcnow()
                    compute_unit = um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "FTW_Input_Status": PENDING},
                        update={"$set" : {"FTW_Input_Status": EXECUTING,
                                          "state": STAGING_INPUT},
                                "$push": {"statehistory": {"state": STAGING_INPUT, "timestamp": ts}}},
                        limit=BULK_LIMIT # TODO: bulklimit is probably not the best way to ensure there is just one
                    )
                    # FIXME: AM: find_and_modify is not bulkable!

                    # 'state' tracks the outcome of this iteration; it is only
                    # changed if the unit turns out to be canceled.
                    state = STAGING_INPUT

                    if compute_unit is None:
                        # Sleep a bit if no new units are available.
                        time.sleep(IDLE_TIME)

                    else:
                        # Initialized up front as the error handler below
                        # refers to it.
                        compute_unit_id = None
                        try:
                            log_messages = []

                            # We have found a new CU. Now we can process the transfer
                            # directive(s) with SAGA.
                            compute_unit_id = str(compute_unit["_id"])
                            remote_sandbox = compute_unit["sandbox"]
                            input_staging = compute_unit["FTW_Input_Directives"]

                            # We need to create the CU's directory in case it doesn't exist yet.
                            log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                            log_messages.append(log_msg)
                            logger.info(log_msg)

                            # Creating the sandbox directory.  One SAGA
                            # directory handle is cached per sandbox URL with
                            # its path reset to '/', and reused to create the
                            # individual unit sandboxes.
                            try:
                                logger.debug ("saga.fs.Directory ('%s')" % remote_sandbox)

                                remote_sandbox_keyurl = saga.Url (remote_sandbox)
                                remote_sandbox_keyurl.path = '/'
                                remote_sandbox_key = str(remote_sandbox_keyurl)

                                if  remote_sandbox_key not in self._saga_dirs :
                                    self._saga_dirs[remote_sandbox_key] = \
                                            saga.filesystem.Directory (remote_sandbox_key,
                                                    flags=saga.filesystem.CREATE_PARENTS,
                                                    session=self._session)

                                saga_dir = self._saga_dirs[remote_sandbox_key]
                                saga_dir.make_dir (remote_sandbox,
                                                   flags=saga.filesystem.CREATE_PARENTS)
                            except Exception as e :
                                logger.exception('Error: %s' % e)
                                # FIXME: why is this exception ignored?  AM


                            logger.info("Processing input file transfers for ComputeUnit %s" % compute_unit_id)
                            # Loop over all transfer directives and execute them.
                            for sd in input_staging:

                                # Re-read the unit state before every transfer
                                # so that a cancellation interrupts staging.
                                state_doc = um_col.find_one(
                                    {"_id": compute_unit_id},
                                    fields=["state"]
                                )
                                if state_doc['state'] == CANCELED:
                                    logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                    state = CANCELED
                                    break

                                abs_src = os.path.abspath(sd['source'])
                                input_file_url = saga.Url("file://localhost/%s" % abs_src)
                                if not sd['target']:
                                    target = remote_sandbox
                                else:
                                    target = "%s/%s" % (remote_sandbox, sd['target'])

                                log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                                log_messages.append(log_msg)
                                logger.debug(log_msg)

                                # Execute the transfer.
                                logger.debug ("saga.fs.File ('%s')" % input_file_url)
                                input_file = saga.filesystem.File(
                                    input_file_url,
                                    session=self._session
                                )

                                if CREATE_PARENTS in sd['flags']:
                                    copy_flags = saga.filesystem.CREATE_PARENTS
                                else:
                                    copy_flags = 0

                                try :
                                    input_file.copy(target, flags=copy_flags)
                                except Exception as e :
                                    # A failed transfer must not be recorded as
                                    # 'Done' below: log it and let the handler
                                    # for this unit mark the CU as FAILED.
                                    logger.exception (e)
                                    raise
                                finally :
                                    # Release the file handle on success and
                                    # on failure alike.
                                    input_file.close()

                                # The transfer went fine, update the state of this StagingDirective to Done
                                um_col.find_and_modify(
                                    query={"_id" : compute_unit_id,
                                           'FTW_Input_Status': EXECUTING,
                                           'FTW_Input_Directives.state': PENDING,
                                           'FTW_Input_Directives.source': sd['source'],
                                           'FTW_Input_Directives.target': sd['target'],
                                           },
                                    update={'$set': {'FTW_Input_Directives.$.state': 'Done'},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : log_msg}}
                                    }
                                )

                        except Exception as e :
                            # Update the CU's state 'FAILED'.
                            ts = datetime.datetime.utcnow()
                            logentry = {'message'  : "Input transfer failed: %s" % e,
                                        'timestamp': ts}

                            um_col.update({'_id': compute_unit_id}, {
                                '$set': {'state': FAILED},
                                '$push': {
                                    'statehistory': {'state': FAILED, 'timestamp': ts},
                                    'log': logentry
                                }
                            })

                            logger.exception(str(logentry))

                    # Code below is only to be run by the "first" or only worker
                    if self._worker_number > 1:
                        continue

                    # If the CU was canceled we can skip the remainder of this loop.
                    if state == CANCELED:
                        continue

                    #
                    # Check to see if there are more pending Directives, if not, we are Done
                    #
                    cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                            "$or": [ {"Agent_Input_Status": EXECUTING},
                                                     {"FTW_Input_Status": EXECUTING}
                                                   ]
                                            }
                                           )
                    # Iterate over all the returned CUs (if any)
                    for cu in cursor_w:
                        # See if there are any FTW Input Directives still pending
                        if cu['FTW_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Input_Directives']):
                            # All Input Directives for this FTW are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                          {'$set': {'FTW_Input_Status': DONE},
                                           '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : 'All FTW Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                        # See if there are any Agent Input Directives still pending or executing,
                        # if not, mark it DONE.
                        if cu['Agent_Input_Status'] == EXECUTING and \
                                not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Input_Directives']):
                            # All Input Directives for this Agent are done, mark the CU accordingly
                            um_col.update({"_id": cu["_id"]},
                                           {'$set': {'Agent_Input_Status': DONE},
                                            '$push': {'log': {
                                                'timestamp': datetime.datetime.utcnow(),
                                                'message'  : 'All Agent Input Staging Directives done - %d.' % self._worker_number}}
                                           }
                            )

                    #
                    # If both Agent and FTW staging are done (or were never
                    # requested) for a CU in STAGING_INPUT, mark that CU
                    # PendingExecution.
                    # NOTE(review): find_and_modify presumably touches a single
                    # document per call, so remaining units are picked up on
                    # later loop iterations -- confirm.
                    #
                    ts = datetime.datetime.utcnow()
                    um_col.find_and_modify(
                        query={"unitmanager": self.unit_manager_id,
                               "Agent_Input_Status": { "$in": [ None, DONE ] },
                               "FTW_Input_Status": { "$in": [ None, DONE ] },
                               "state": STAGING_INPUT
                        },
                        update={"$set": {
                                    "state": PENDING_EXECUTION
                                },
                                "$push": {
                                    "statehistory": {"state": PENDING_EXECUTION, "timestamp": ts}
                                }
                        }
                    )

            except Exception as e :

                logger.exception("transfer worker error: %s" % e)
                self._session.close (cleanup=False)
                raise

        except SystemExit as e :
            logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
Example #14
0
    def submit_units(self, unit_descriptions):
        """Creates :class:`radical.pilot.ComputeUnit` instances for one or more
        descriptions, publishes them to the worker and hands them to the
        unit scheduler.

        **Arguments:**

            * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
              or list of :class:`radical.pilot.ComputeUnitDescription`]: The
              description of the compute unit instance(s) to create.

        **Returns:**

              * A list of :class:`radical.pilot.ComputeUnit` objects if a list
                was passed, otherwise the single
                :class:`radical.pilot.ComputeUnit` created for the given
                description.

        **Raises:**

            * :class:`radical.pilot.PilotException`
            * `IncorrectState` if this manager instance has no uid.
        """

        if not self._uid:
            raise IncorrectState(msg="Invalid object instance.")

        # Remember the shape of the argument so that the return value can
        # mirror it: list in, list out -- single description in, single unit
        # out.
        single = not isinstance(unit_descriptions, list)
        descriptions = [unit_descriptions] if single else unit_descriptions

        # All units start out in SCHEDULING state, and are made known to the
        # worker before the scheduler gets to see them.
        units = [ComputeUnit.create (unit_description=descr,
                                     unit_manager_obj=self,
                                     local_state=SCHEDULING)
                 for descr in descriptions]

        self._worker.publish_compute_units (units=units)

        # The result of the scheduler is passed on to handle_schedule()
        # unchanged; a scheduler failure is logged and propagated to the
        # caller.
        try:
            schedule = self._scheduler.schedule (units=units)

        except Exception:
            logger.exception ("Internal error - unit scheduler failed")
            raise

        self.handle_schedule (schedule)

        return units[0] if single else units
Example #15
0
    def run(self):
        """run() is called when the process is started via
           PilotManagerController.start().

           Until ``self._stop`` is set, the pilots of this PilotManager are
           pulled from the database, the local cache (``self._shared_data``)
           is refreshed, callbacks are invoked for pilots whose state
           changed, and the non-final units of pilots which reached a final
           state are set to CANCELED.  The ``_initialized`` event is set
           after the first pass.  On exit, all pilot launcher workers are
           stopped.
        """

        # sys.exit() raises SystemExit -- we want to see that here
        try :

            logger.debug("Worker thread (ID: %s[%s]) for PilotManager %s started." %
                        (self.name, self.ident, self._pm_id))

            final_states = (DONE, FAILED, CANCELED)

            while not self._stop.is_set():

                # Check and update pilots. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                pilot_docs    = self._db.get_pilots(pilot_manager_id=self._pm_id)
                state_changed = False

                for pilot_doc in pilot_docs:
                    pid      = str(pilot_doc["_id"])
                    db_state = pilot_doc["state"]

                    # Look up the state we have seen last for this pilot;
                    # pilots we see for the first time get a fresh cache
                    # entry and no previous state.
                    if pid not in self._shared_data:
                        cached_state = None
                        self._shared_data[pid] = {
                            'data':          pilot_doc,
                            'callbacks':     [],
                            'facade_object': None
                        }
                    else:
                        cached_state = self._shared_data[pid]["data"]["state"]

                    self._shared_data[pid]['data'] = pilot_doc

                    # FIXME: hack.  The Canceling state is by its nature not
                    # recorded in the database, but only in the local cache.
                    # If it is the cached state, the cache must not move back
                    # into a non-final state: the cached state is restored
                    # (making the cache explicitly different from the DB
                    # record), and no state callback is triggered.
                    suppress_cb = (cached_state == CANCELING and
                                   db_state not in final_states)
                    if  suppress_cb :
                        self._shared_data[pid]["data"]["state"] = cached_state

                    if db_state != cached_state :
                        state_changed = True

                        if not suppress_cb :
                            logger.info("ComputePilot '%s' state changed from '%s' to '%s'." \
                                            % (pid, cached_state, db_state))

                            # Propagate the state change via all pilot-level
                            # callbacks.  This also includes communication
                            # to the unit scheduler which may, or may not,
                            # cancel the pilot's units.
                            self.call_callbacks(pid, db_state)

                    # Units of a pilot in a final state cannot make progress
                    # anymore: cancel those which are not final themselves.
                    if db_state in final_states:
                        pilot_units = self._db.pilot_list_compute_units(pilot_uid=pid)
                        self._db.set_compute_unit_state (
                            unit_ids=pilot_units,
                            state=CANCELED,
                            src_states=[ PENDING_INPUT_STAGING,
                                         STAGING_INPUT,
                                         PENDING_EXECUTION,
                                         SCHEDULING,
                                         EXECUTING,
                                         PENDING_OUTPUT_STAGING,
                                         STAGING_OUTPUT
                                       ],
                            log="Pilot '%s' has terminated with state '%s'. CU canceled." % (pid, db_state))

                # The first completed pass marks us as initialized.
                if not self._initialized.is_set():
                    self._initialized.set()

                # Nothing happened in this cycle: back off a little.
                if  not state_changed :
                    time.sleep(IDLE_TIME)

        except SystemExit :
            logger.exception ("pilot manager controller thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()

        finally :
            # shut down the autonomous pilot launcher worker(s)
            for launcher in self._pilot_launcher_worker_pool:
                logger.debug("pworker %s stops   launcher %s" % (self.name, launcher.name))
                launcher.stop ()
                logger.debug("pworker %s stopped launcher %s" % (self.name, launcher.name))
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug(
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then send to the local or remote
                # queueing system. If this succedes, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                    },
                    update={
                        "$set": {
                            "state": LAUNCHING
                        },
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts
                            }
                        }
                    })

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                                                      DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                            'tunnel_bind_device')
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                            'forward_tunnel_endpoint')
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                            'mpi_launch_method')
                        pre_bootstrap = resource_cfg.get('pre_bootstrap')
                        python_interpreter = resource_cfg.get(
                            'python_interpreter')
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                            'task_launch_method')
                        rp_version = resource_cfg.get('rp_version',
                                                      DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')

                        if stage_cacerts.lower() == 'true':
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                            logger.warn(
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            )
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme: db_url.scheme = 'mongodb'
                        if not db_url.host: db_url.host = 'localhost'
                        if not db_url.port: db_url.port = 27017
                        if not database_name: database_name = 'radicalpilot'

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host,
                                                       db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = 'default_bootstrapper.sh'
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" %
                                                 bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" %
                                                 pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url,
                                                         session=self._session)
                        bs_script.copy(bs_script_tgt,
                                       flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if  not rp_version.startswith('@') and \
                            not rp_version in ['installed', 'local', 'debug']:
                            raise ValueError("invalid rp_version '%s'" %
                                             rp_version)

                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [
                                    ru.sdist_path, saga.sdist_path, sdist_path
                            ]:

                                sdist_url = saga.Url("file://localhost/%s" %
                                                     path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                logentries.append(
                                    Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_script_url = saga.Url("file://localhost/%s" %
                                                     cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" %
                                                     pilot_sandbox)

                            cc_script = saga.filesystem.File(
                                cc_script_url, session=self._session)
                            cc_script.copy(
                                cc_script_tgt,
                                flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                'CRITICAL': 1,
                                'ERROR': 2,
                                'WARNING': 3,
                                'WARN': 3,
                                'INFO': 4,
                                'DEBUG': 5
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = 'luve'  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private
                            if virtenv_mode is not 'private':
                                cleanup = cleanup.replace('v', '')

                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                                pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][
                                js_url]
                        else:
                            js = saga.job.Service(js_url,
                                                  session=self._session)
                            self._shared_worker_data['job_services'][
                                js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l pilot_bootstrapper.sh", bootstrap_args
                        ]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                            jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url
                        ]

                        msg = "SAGA job submitted with job id %s" % str(
                            saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": 'Launching'
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                                }
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })
                        logger.exception('\n'.join(log_messages))

        except SystemExit as e:
            logger.exception(
                "pilot launcher thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()
Example #17
0
    def schedule_compute_units(self, pilot_uid, units):
        """Request the scheduling of one or more ComputeUnits on a
           ComputePilot.

           The staging directives of every unit are split between the
           transfer worker (FTW_* attributes) and the agent (Agent_*
           attributes).  Units which end up with at least one accepted input
           staging directive are set to PENDING_INPUT_STAGING, all other
           units are set to PENDING_EXECUTION.  Both groups are assigned to
           the pilot in the database.

           pilot_uid: id of the target pilot; handed to the database layer
                      as-is.
           units:     iterable of unit objects exposing 'uid' and
                      'description' (with 'input_staging' and
                      'output_staging').

           Any exception is logged and re-raised to the caller.
        """

        try:
            cu_transfer = list()
            cu_notransfer = list()

            # Get some information about the pilot sandbox from the database.
            pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
            # TODO: this hack below relies on what?! That there is just one pilot?
            pilot_sandbox = pilot_info[0]['sandbox']

            # Split units into two different lists: the first list contains the CUs
            # that need file transfer and the second list contains the CUs that
            # don't. The latter is added to the pilot directly, while the former
            # is added to the transfer queue.
            for unit in units:

                # Create object for staging status tracking
                unit.FTW_Input_Status = None
                unit.FTW_Input_Directives = []
                unit.Agent_Input_Status = None
                unit.Agent_Input_Directives = []
                unit.FTW_Output_Status = None
                unit.FTW_Output_Directives = []
                unit.Agent_Output_Status = None
                unit.Agent_Output_Directives = []

                # Split the input staging directives over the transfer worker and the agent
                input_sds = unit.description.input_staging
                if not isinstance(input_sds, list):
                    # Ugly, but is a workaround for iterating on attribute interface
                    # TODO: Verify if this piece of code is actually still required
                    input_sds = [input_sds] if input_sds else []

                for input_sd_entry in input_sds:
                    action = input_sd_entry['action']
                    source = Url(input_sd_entry['source'])
                    target = Url(input_sd_entry['target'])

                    new_sd = {
                        'action': action,
                        'source': str(source),
                        'target': str(target),
                        'flags': input_sd_entry['flags'],
                        'priority': input_sd_entry['priority'],
                        'state': PENDING
                    }

                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    elif action == TRANSFER:
                        if source.scheme and source.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote pull from the agent
                            unit.Agent_Input_Directives.append(new_sd)
                            unit.Agent_Input_Status = PENDING
                        else:
                            # Transfer from local to sandbox
                            unit.FTW_Input_Directives.append(new_sd)
                            unit.FTW_Input_Status = PENDING
                    else:
                        # Unknown actions are dropped, not scheduled.
                        logger.warn(
                            'Not sure if action %s makes sense for input staging'
                            % action)

                # Split the output staging directives over the transfer worker and the agent
                output_sds = unit.description.output_staging
                if not isinstance(output_sds, list):
                    # Ugly, but is a workaround for iterating on att iface
                    # TODO: Verify if this piece of code is actually still required
                    output_sds = [output_sds] if output_sds else []

                for output_sds_entry in output_sds:
                    action = output_sds_entry['action']
                    source = Url(output_sds_entry['source'])
                    target = Url(output_sds_entry['target'])

                    new_sd = {
                        'action': action,
                        'source': str(source),
                        'target': str(target),
                        'flags': output_sds_entry['flags'],
                        'priority': output_sds_entry['priority'],
                        'state': PENDING
                    }

                    # Note: output staging is tracked with status NEW, while
                    # input staging above starts out as PENDING.
                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    elif action == TRANSFER:
                        if target.scheme and target.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote push from the agent
                            unit.Agent_Output_Directives.append(new_sd)
                            unit.Agent_Output_Status = NEW
                        else:
                            # Transfer from sandbox back to local
                            unit.FTW_Output_Directives.append(new_sd)
                            unit.FTW_Output_Status = NEW
                    else:
                        # Unknown actions are dropped, not scheduled.
                        logger.warn(
                            'Not sure if action %s makes sense for output staging'
                            % action)

                # Only input staging decides whether the unit has to wait for
                # data before it can be executed.
                if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                    log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                    self._db.set_compute_unit_state(unit.uid,
                                                    PENDING_INPUT_STAGING, log)
                    cu_transfer.append(unit)
                else:
                    cu_notransfer.append(unit)

            # Bulk-add all non-transfer units, then all transfer units.
            self._db.assign_compute_units_to_pilot(units=cu_notransfer,
                                                   pilot_uid=pilot_uid,
                                                   pilot_sandbox=pilot_sandbox)

            self._db.assign_compute_units_to_pilot(units=cu_transfer,
                                                   pilot_uid=pilot_uid,
                                                   pilot_sandbox=pilot_sandbox)

            for unit in cu_notransfer:
                log = "Scheduled for execution on ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION,
                                                log)

            logger.info(
                "Scheduled ComputeUnits %s for execution on ComputePilot '%s'."
                % (cu_notransfer, pilot_uid))
        except Exception:
            # Log with traceback here, but leave the handling to the caller.
            logger.exception('error in unit manager controller (schedule())')
            raise
Example #18
0
    def run(self):
        """run() is called when the process is started via
           PilotManagerController.start().

           Until the stop event is set, the compute units of this unit
           manager are pulled from the database, the shared data cache is
           refreshed, and the unit state callbacks are invoked for every unit
           whose state differs from the cached one.  After the first full
           pass the 'initialized' event is set.  Cycles without any state
           change sleep for IDLE_TIME.

           On exit (regular or not), all input and output file transfer
           workers are stopped.  A SystemExit raised in this thread is turned
           into an interrupt of the main thread.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:

            logger.debug(
                "Worker thread (ID: %s[%s]) for UnitManager %s started." %
                (self.name, self.ident, self._um_id))

            while not self._stop.is_set():

                # =================================================================
                #
                # Check and update units. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                unit_list = self._db.get_compute_units(
                    unit_manager_id=self._um_id)
                action = False

                for unit in unit_list:
                    unit_id = str(unit["_id"])

                    new_state = unit["state"]
                    if unit_id in self._shared_data:
                        old_state = self._shared_data[unit_id]["data"]["state"]
                    else:
                        # First time we see this unit: create its cache entry.
                        old_state = None
                        with self._shared_data_lock:
                            self._shared_data[unit_id] = {
                                'data': unit,
                                'callbacks': [],
                                'facade_object': None
                            }

                    # 'with' releases the lock even if the update raises.
                    with self._shared_data_lock:
                        self._shared_data[unit_id]["data"] = unit

                    if new_state != old_state:
                        # On a state change, we fire zee callbacks.
                        logger.info(
                            "RUN ComputeUnit '%s' state changed from '%s' to '%s'."
                            % (unit_id, old_state, new_state))

                        # The state of the unit has changed, We call all
                        # unit-level callbacks to propagate this.
                        self.call_unit_state_callbacks(unit_id, new_state)

                        action = True

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():
                    self._initialized.set()

                # sleep a little if this cycle was idle
                if not action:
                    time.sleep(IDLE_TIME)

        except SystemExit:
            logger.exception(
                "unit manager controller thread caught system exit -- forcing application shutdown"
            )
            # 'thread' is the Python 2 name of the low-level threading module.
            import thread
            thread.interrupt_main()

        finally:
            # shut down the autonomous input / output transfer worker(s)
            for worker in self._input_file_transfer_worker_pool:
                logger.debug("uworker %s stops   itransfer %s" %
                             (self.name, worker.name))
                worker.stop()
                logger.debug("uworker %s stopped itransfer %s" %
                             (self.name, worker.name))

            for worker in self._output_file_transfer_worker_pool:
                logger.debug("uworker %s stops   otransfer %s" %
                             (self.name, worker.name))
                worker.stop()
                logger.debug("uworker %s stopped otransfer %s" %
                             (self.name, worker.name))
Example #19
0
    def register_cancel_pilots_request(self, pilot_ids=None):
        """Registers one or more pilots for cancelation.

           pilot_ids: a list of pilot ids, a single pilot id string, or None.
                      None selects all pilots the database reports for this
                      pilot manager.

           The cancel command is first sent to the pilots via the database.
           Pilots which are still pending or launching are then canceled
           right away through their job service.  Pilots in any other
           non-final state get 10 seconds to react on the command before
           their job is canceled, too -- so this call may block for that
           long.  Failures of individual job cancelations are logged and do
           not abort the request.
        """

        if pilot_ids is None:
            pilot_ids = [str(pilot["_id"]) for pilot in
                         self._db.get_pilots(pilot_manager_id=self._pm_id)]

        self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
        logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

        # A single id passed as a plain string must not be iterated character
        # by character below.  (str, type(u'')) covers byte and unicode
        # strings on Python 2 as well as str on Python 3.
        if isinstance(pilot_ids, (str, type(u''))):
            pilot_ids = [pilot_ids]

        # pilots which are in ACTIVE state should now have time to react on the
        # CANCEL command sent above.  Meanwhile, we'll cancel all pending
        # pilots.  If that is done, we wait a little, say 10 seconds, to give
        # the pilot time to pick up the request and shut down -- but if it does
        # not do that, it will get killed the hard way...
        delayed_cancel = list()

        for pilot_id in pilot_ids:

            if pilot_id not in self._shared_data:
                logger.warn("can't actively cancel pilot %s: unknown pilot" % pilot_id)
                logger.debug(pprint.pformat(self._shared_data))
                continue

            # read state from _shared_data only once, so that it does not
            # change under us...
            old_state = str(self._shared_data[pilot_id]["data"]["state"])

            logger.warn("actively cancel pilot %s state: %s" % (pilot_id, old_state))
            if old_state in [DONE, FAILED, CANCELED]:
                logger.warn("can't actively cancel pilot %s: already in final state" % pilot_id)

            elif old_state in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE]:
                if pilot_id in self._shared_worker_data['job_ids']:

                    try:
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        self._shared_data[pilot_id]["data"]["state"] = CANCELING
                        logger.info("actively cancel pilot %s (%s, %s)" % (pilot_id, job_id, js_url))

                        js = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job(job_id)
                        job.cancel()
                    except Exception:
                        # best effort: log and continue with the next pilot
                        logger.exception('pilot cancelation failed')

                else:
                    logger.warn("can't actively cancel pilot %s: no job id known" % pilot_id)
                    logger.debug(pprint.pformat(self._shared_worker_data))

            else:
                logger.debug("delay to actively cancel pilot %s: state %s" % (pilot_id, old_state))
                delayed_cancel.append(pilot_id)

        # now tend to all delayed cancellation requests (ie. active pilots) --
        # if there are any
        if delayed_cancel:

            # grant some leeway to the unruly children...
            time.sleep(10)

            for pilot_id in delayed_cancel:

                if pilot_id in self._shared_worker_data['job_ids']:

                    try:
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        logger.info("actively cancel pilot %s (delayed) (%s, %s)" % (pilot_id, job_id, js_url))

                        js = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job(job_id)
                        job.cancel()
                    except Exception:
                        # the pilot may well have shut down on its own by now
                        logger.warn('delayed pilot cancelation failed. '
                                    'This is not necessarily a problem.')

                else:
                    logger.warn("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
                    logger.debug(pprint.pformat(self._shared_worker_data))
    def run(self):
        """Main loop of the UnitManager controller worker.

        Until ``self._stop`` is set, the loop repeatedly fetches all
        ComputeUnit documents of this UnitManager (``self._um_id``) from
        the database, refreshes the cached copy in ``self._shared_data``,
        and calls ``call_unit_state_callbacks()`` for every unit whose
        state differs from the previously cached one (units seen for the
        first time have a previous state of ``None``).

        ``self._initialized`` is set after the first polling cycle.  A
        cycle without any state change is followed by a sleep of
        ``IDLE_TIME``.

        A ``SystemExit`` raised inside the loop is logged and forwarded to
        the main thread via ``thread.interrupt_main()``.  In all cases the
        input and output file transfer workers are stopped on the way out.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            logger.debug("Worker thread (ID: %s[%s]) for UnitManager %s started." %
                        (self.name, self.ident, self._um_id))

            while not self._stop.is_set():

                # =================================================================
                #
                # Check and update units. This needs to be optimized at
                # some point, i.e., state pulling should be conditional
                # or triggered by a tailable MongoDB cursor, etc.
                unit_list = self._db.get_compute_units(unit_manager_id=self._um_id)
                action    = False

                for unit in unit_list:
                    unit_id   = str(unit["_id"])
                    new_state = unit["state"]

                    # Read the previous state and store the fresh document
                    # in one critical section.  The 'with' statement
                    # releases the lock even if an exception is raised.
                    with self._shared_data_lock:
                        if unit_id in self._shared_data:
                            old_state = self._shared_data[unit_id]["data"]["state"]
                            self._shared_data[unit_id]["data"] = unit
                        else:
                            # first time we see this unit
                            old_state = None
                            self._shared_data[unit_id] = {
                                'data':          unit,
                                'callbacks':     [],
                                'facade_object': None
                            }

                    if new_state != old_state:
                        # On a state change, we fire the callbacks.
                        logger.info("RUN ComputeUnit '%s' state changed from '%s' to '%s'." % (unit_id, old_state, new_state))

                        # The state of the unit has changed.  We call all
                        # unit-level callbacks to propagate this.  This is
                        # done outside of the lock.
                        self.call_unit_state_callbacks(unit_id, new_state)

                        action = True

                # After the first iteration, we are officially initialized!
                if not self._initialized.is_set():
                    self._initialized.set()

                # sleep a little if this cycle was idle
                if  not action :
                    time.sleep(IDLE_TIME)


        except SystemExit :
            logger.exception ("unit manager controller thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()


        finally :
            # shut down the autonomous input / output transfer worker(s)
            for worker in self._input_file_transfer_worker_pool:
                logger.debug("uworker %s stops   itransfer %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("uworker %s stopped itransfer %s" % (self.name, worker.name))

            for worker in self._output_file_transfer_worker_pool:
                logger.debug("uworker %s stops   otransfer %s" % (self.name, worker.name))
                worker.stop ()
                logger.debug("uworker %s stopped otransfer %s" % (self.name, worker.name))
Example #21
0
    def run(self):
        """Main loop of the output file transfer worker.

        After connecting to MongoDB, the loop runs until ``self._stop`` is
        set and performs the following steps in each cycle:

          1. Claim one ComputeUnit of this UnitManager whose
             ``FTW_Output_Status`` is PENDING: the status is switched to
             EXECUTING and the unit state to STAGING_OUTPUT.  If none is
             found, sleep for ``IDLE_TIME``.
          2. For each entry in the unit's ``FTW_Output_Directives``: stop
             if the unit was CANCELED in the meantime, otherwise mark the
             directive EXECUTING, copy ``<sandbox>/<source>`` to the local
             target via SAGA, and mark the directive DONE.  Any exception
             marks the whole unit as FAILED.
          3. Only for workers with ``self._worker_number <= 1``, and only
             if the current unit was not canceled: set
             ``FTW_Output_Status`` / ``Agent_Output_Status`` to DONE for
             units which have no PENDING or EXECUTING directives left, and
             move one unit in state STAGING_OUTPUT with finished (or no)
             FTW output staging to state DONE.

        The method returns early if the database connection fails.  A
        ``SystemExit`` raised inside the loop is logged and forwarded to
        the main thread via ``thread.interrupt_main()``.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try :

            # Try to connect to the database and create a tailable cursor.
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                um_col = db["%s.cu" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." % self.unit_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            while not self._stop.is_set():
                compute_unit = None

                # See if we can find a ComputeUnit that is waiting for
                # output file transfer.
                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Output_Status": PENDING},
                    update={"$set" : {"FTW_Output_Status": EXECUTING,
                                      "state": STAGING_OUTPUT},
                            "$push": {"statehistory": {"state": STAGING_OUTPUT, "timestamp": ts}}},
                    limit=BULK_LIMIT
                )
                # FIXME: AM: find_and_modify is not bulkable!

                # 'state' tracks the outcome for the unit handled in this
                # cycle; it is only changed if a cancel request is seen.
                state = STAGING_OUTPUT

                #logger.info("OFTW after finding pending cus")
                if compute_unit is None:
                    #logger.info("OFTW no cus, sleep")
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)
                else:
                    logger.info("OFTW cu found, progressing ...")
                    compute_unit_id = None
                    try:
                        # We have found a new CU. Now we can process the transfer
                        # directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox = compute_unit["sandbox"]
                        staging_directives = compute_unit["FTW_Output_Directives"]

                        logger.info("Processing output file transfers for ComputeUnit %s" % compute_unit_id)
                        # Loop over all staging directives and execute them.
                        for sd in staging_directives:

                            # Check if there was a cancel request
                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                                fields=["state"]
                            )
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting output file transfers.")
                                state = CANCELED
                                break

                            action = sd['action']
                            source = sd['source']
                            target = sd['target']
                            flags  = sd['flags']

                            # Mark the beginning of transfer this StagingDirective
                            um_col.find_and_modify(
                                query={"_id" : compute_unit_id,
                                       'FTW_Output_Status': EXECUTING,
                                       'FTW_Output_Directives.state': PENDING,
                                       'FTW_Output_Directives.source': sd['source'],
                                       'FTW_Output_Directives.target': sd['target'],
                                       },
                                update={'$set': {'FTW_Output_Directives.$.state': EXECUTING},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message'  : 'Starting transfer of %s' % source}}
                                }
                            )

                            # The source lives in the unit's remote sandbox.
                            abs_source = "%s/%s" % (remote_sandbox, source)

                            # A bare file name is placed into the current
                            # working directory; any other target is made
                            # absolute.
                            if os.path.basename(target) == target:
                                abs_target = "file://localhost%s" % os.path.join(os.getcwd(), target)
                            else:
                                abs_target = "file://localhost%s" % os.path.abspath(target)

                            log_msg = "Transferring output file %s -> %s" % (abs_source, abs_target)
                            logger.debug(log_msg)

                            logger.debug ("saga.fs.File ('%s')" % saga.Url(abs_source))
                            output_file = saga.filesystem.File(saga.Url(abs_source),
                                session=self._session
                            )

                            # Close the file handle even if the copy
                            # fails, so that failed transfers do not leak
                            # open SAGA file objects.
                            try:
                                if CREATE_PARENTS in flags:
                                    copy_flags = saga.filesystem.CREATE_PARENTS
                                else:
                                    copy_flags = 0
                                logger.debug ("saga.fs.File.copy ('%s')" % saga.Url(abs_target))
                                output_file.copy(saga.Url(abs_target), flags=copy_flags)
                            finally:
                                output_file.close()

                            # If all went fine, update the state of this StagingDirective to Done
                            um_col.find_and_modify(
                                query={"_id" : compute_unit_id,
                                       'FTW_Output_Status': EXECUTING,
                                       'FTW_Output_Directives.state': EXECUTING,
                                       'FTW_Output_Directives.source': sd['source'],
                                       'FTW_Output_Directives.target': sd['target'],
                                       },
                                update={'$set': {'FTW_Output_Directives.$.state': DONE},
                                        '$push': {'log': {
                                            'timestamp': datetime.datetime.utcnow(),
                                            'message'  : log_msg}}
                                }
                            )

                    except Exception as e :
                        # Update the CU's state to 'FAILED'.
                        ts = datetime.datetime.utcnow()
                        log_message = "Output transfer failed: %s" % e
                        # TODO: not only mark the CU as failed, but also the specific Directive
                        um_col.update({'_id': compute_unit_id}, {
                            '$set': {'state': FAILED},
                            '$push': {
                                'statehistory': {'state': FAILED, 'timestamp': ts},
                                'log': {'message': log_message, 'timestamp': ts}
                            }
                        })
                        logger.exception (log_message)


                # Code below is only to be run by the "first" or only worker
                if self._worker_number > 1:
                    continue

                # If the CU was canceled we can skip the remainder of this loop.
                if state == CANCELED:
                    continue

                #
                # Check to see if there are more active Directives, if not, we are Done
                #
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [ {"Agent_Output_Status": EXECUTING},
                                                 {"FTW_Output_Status": EXECUTING}
                                        ]
                }
                )
                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:
                    # See if there are any FTW Output Directives still pending
                    if cu['FTW_Output_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['FTW_Output_Directives']):
                        # All Output Directives for this FTW are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'FTW_Output_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message'  : 'All FTW output staging directives done - %d.' % self._worker_number}}
                                       }
                        )

                    # See if there are any Agent Output Directives still pending
                    if cu['Agent_Output_Status'] == EXECUTING and \
                            not any(d['state'] == EXECUTING or d['state'] == PENDING for d in cu['Agent_Output_Directives']):
                        # All Output Directives for this Agent are done, mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set': {'Agent_Output_Status': DONE},
                                       '$push': {'log': {
                                           'timestamp': datetime.datetime.utcnow(),
                                           'message'  : 'All Agent Output Staging Directives done-%d.' % self._worker_number}}
                                      }
                        )

                #
                # Check for all CUs if both Agent and FTW staging is done, we can then mark the CU Done
                #
                ts = datetime.datetime.utcnow()
                um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           # TODO: Now that our state model is linear,
                           # we probably don't need to check Agent_Output_Status anymore.
                           # Given that it is not updated by the agent currently, disable it here.
                           #"Agent_Output_Status": { "$in": [ None, DONE ] },
                           "FTW_Output_Status": { "$in": [ None, DONE ] },
                           "state": STAGING_OUTPUT
                    },
                    update={"$set": {
                        "state": DONE
                    },
                            "$push": {
                                "statehistory": {"state": DONE, "timestamp": ts}
                            }
                    }
                )

        except SystemExit :
            logger.exception("output file transfer thread caught system exit -- forcing application shutdown")
            import thread
            thread.interrupt_main ()
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then sent to the local or remote
                # queueing system. If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                    },
                )

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap = resource_cfg.get("pre_bootstrap")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")

                        if stage_cacerts.lower() == "true":
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme:
                            db_url.scheme = "mongodb"
                        if not db_url.host:
                            db_url.host = "localhost"
                        if not db_url.port:
                            db_url.port = 27017
                        if not database_name:
                            database_name = "radicalpilot"

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host, db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = "default_bootstrapper.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                        bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # An rp_version which does not adhere to the
                        # above syntax is rejected with a ValueError (see
                        # the check below).

                        if not rp_version.startswith("@") and not rp_version in ["installed", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [ru.sdist_path, saga.sdist_path, sdist_path]:

                                sdist_url = saga.Url("file://localhost/%s" % path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                            cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                            cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                "CRITICAL": 1,
                                "ERROR": 2,
                                "WARNING": 3,
                                "WARN": 3,
                                "INFO": 4,
                                "DEBUG": 5,
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = "luve"  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private
                            if virtenv_mode is not "private":
                                cleanup = cleanup.replace("v", "")

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": "Launching"},
                            {
                                "$set": {"state": PENDING_ACTIVE, "saga_job_id": saga_job_id},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                {
                                    "$set": {"saga_job_id": saga_job_id},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},
                                },
                            )

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                            {
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )
                        logger.exception("\n".join(log_messages))

        except SystemExit as e:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

            thread.interrupt_main()