def get_compute_pilot_data(self, pilot_ids=None):
    """Returns the raw data (JSON dicts) of one or more ComputePilots
       registered with this Worker / PilotManager.
    """
    # Wait for the initialized event to assert proper operation.
    self._initialized.wait()

    try:
        if pilot_ids is None:
            pilot_ids = self._shared_data.keys()

        return_list_type = True
        if not isinstance(pilot_ids, list):
            return_list_type = False
            pilot_ids = [pilot_ids]

        data = list()
        for pilot_id in pilot_ids:
            data.append(self._shared_data[pilot_id]['data'])

        if return_list_type:
            return data
        else:
            return data[0]

    except KeyError as e:
        logger.exception("Unknown Pilot ID %s: %s" % (pilot_id, e))
        raise
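# Usage sketch for get_compute_pilot_data() -- a hypothetical illustration,
# assuming `worker` is an initialized controller instance; the pilot ID
# 'pilot.0001' is made up:
#
#   data     = worker.get_compute_pilot_data('pilot.0001')  # single dict
#   all_data = worker.get_compute_pilot_data()               # list of dicts
#   state    = data['state']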
def call_unit_state_callbacks(self, unit_id, new_state):
    """Wrapper function to call all relevant callbacks, on unit-level
       as well as manager-level.
    """
    # This is the point where, at the earliest, the application could have
    # been notified about unit state changes.  So we record that event.
    if unit_id not in self._callback_histories:
        self._callback_histories[unit_id] = list()
    self._callback_histories[unit_id].append(
        {'timestamp': datetime.datetime.utcnow(),
         'state'    : new_state})

    for [cb, cb_data] in self._shared_data[unit_id]['callbacks']:
        try:
            if self._shared_data[unit_id]['facade_object']:
                if cb_data:
                    cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
                else:
                    cb(self._shared_data[unit_id]['facade_object'], new_state)
            else:
                logger.error("Couldn't call callback (no pilot instance)")
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # If we have any manager-level callbacks registered, we
    # call those as well!
    if UNIT_STATE not in self._manager_callbacks:
        self._manager_callbacks[UNIT_STATE] = list()

    for [cb, cb_data] in self._manager_callbacks[UNIT_STATE]:
        if not self._shared_data[unit_id]['facade_object']:
            logger.warning('skip cb for incomplete unit (%s: %s)' % (unit_id, new_state))
            break

        try:
            if cb_data:
                cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
            else:
                cb(self._shared_data[unit_id]['facade_object'], new_state)
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # If we meet a final state, we record the object's callback history for
    # later evaluation.
    if new_state in (DONE, FAILED, CANCELED):
        self._db.publish_compute_unit_callback_history(
            unit_id, self._callback_histories[unit_id])
def unschedule_compute_units(self, units):
    """Set the unit state to UNSCHEDULED.
    """
    try:
        unit_ids = [unit.uid for unit in units]
        self._db.set_compute_unit_state(unit_ids, UNSCHEDULED,
                                        "unit remains unscheduled")
    except Exception as e:
        logger.exception('error in unit manager controller (unschedule())')
        raise
def fire_manager_callback(self, metric, obj, value):
    """Fire a manager-level callback.
    """
    if metric not in self._manager_callbacks:
        self._manager_callbacks[metric] = list()

    for [cb, cb_data] in self._manager_callbacks[metric]:
        try:
            if cb_data:
                cb(obj, value, cb_data)
            else:
                cb(obj, value)
        except Exception as e:
            logger.exception("Couldn't call '%s' callback function %s: %s"
                             % (metric, cb, e))
            raise
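# Callback signature sketch for fire_manager_callback(): callbacks registered
# in self._manager_callbacks are invoked as cb(obj, value) or, if registered
# with extra data, cb(obj, value, cb_data).  A minimal example (names are
# illustrative, not part of the API):
#
#   def my_state_cb(obj, state, cb_data=None):
#       print "object %s is now in state %s" % (obj, state)
#
#   # after registration under some metric, a state change would trigger:
#   # self.fire_manager_callback(UNIT_STATE, unit, new_state)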
def submit_units(self, unit_descriptions):
    """Submits one or more :class:`radical.pilot.ComputeUnit` instances to
       the unit manager.

       **Arguments:**
           * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
             or list of :class:`radical.pilot.ComputeUnitDescription`]: The
             description of the compute unit instance(s) to create.

       **Returns:**
           * A list of :class:`radical.pilot.ComputeUnit` objects.

       **Raises:**
           * :class:`radical.pilot.PilotException`
    """
    if not self._uid:
        raise IncorrectState(msg="Invalid object instance.")

    return_list_type = True
    if not isinstance(unit_descriptions, list):
        return_list_type  = False
        unit_descriptions = [unit_descriptions]

    # we return a list of compute units
    #
    # the scheduler will return a dictionary of the form:
    #   {
    #     ud_1: pilot_id_a,
    #     ud_2: pilot_id_b,
    #     ...
    #   }
    #
    # The scheduler may not be able to schedule some units -- those will
    # have 'None' as pilot ID.

    units = list()
    for ud in unit_descriptions:
        units.append(ComputeUnit.create(unit_description=ud,
                                        unit_manager_obj=self,
                                        local_state=SCHEDULING))

    self._worker.publish_compute_units(units=units)

    schedule = None
    try:
        schedule = self._scheduler.schedule(units=units)
    except Exception as e:
        logger.exception("Internal error - unit scheduler failed")
        raise

    self.handle_schedule(schedule)

    if return_list_type:
        return units
    else:
        return units[0]
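# Application-level usage sketch for submit_units() -- a hedged illustration,
# assuming `umgr` is a connected UnitManager; the executable and arguments
# are made-up examples:
#
#   import radical.pilot as rp
#
#   cud = rp.ComputeUnitDescription()
#   cud.executable = "/bin/echo"
#   cud.arguments  = ["hello"]
#
#   unit  = umgr.submit_units(cud)    # single description -> single unit
#   units = umgr.submit_units([cud])  # list -> list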
def __init__(self, database_url=None, database_name="radicalpilot",
             uid=None, name=None):
    """Creates a new or reconnects to an existing session.

       If called without a uid, a new Session instance is created and
       stored in the database.  If uid is set, an existing session is
       retrieved from the database.

       **Arguments:**
           * **database_url** (`string`): The MongoDB URL.  If none is
             given, RP uses the environment variable RADICAL_PILOT_DBURL.
             If that is not set, an error will be raised.

           * **database_name** (`string`): An alternative database name
             (default: 'radicalpilot').

           * **uid** (`string`): If uid is set, we try to reconnect to an
             existing session instead of creating a new one.

           * **name** (`string`): An optional human-readable name.

       **Returns:**
           * A new Session instance.

       **Raises:**
           * :class:`radical.pilot.DatabaseError`
    """
    # init the base class inits
    saga.Session.__init__(self)
    Object.__init__(self)

    # before doing anything else, set up the debug helper for the lifetime
    # of the session.
    self._debug_helper = ru.DebugHelper()

    # Dictionaries holding all manager objects created during the session.
    self._pilot_manager_objects = list()
    self._unit_manager_objects  = list()

    # Create a new process registry.  All objects belonging to this
    # session will register their worker processes (if they have any)
    # in this registry.  This makes it easier to shut down things in
    # a more coordinated fashion.
    self._process_registry = _ProcessRegistry()

    # The resource configuration dictionary associated with the session.
    self._resource_configs = {}

    self._database_url  = database_url
    self._database_name = database_name

    if not self._database_url:
        self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

    if not self._database_url:
        raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

    logger.info("using database url %s" % self._database_url)

    # if the database url contains a path element, we interpret that as
    # database name (without the leading slash)
    tmp_url = ru.Url(self._database_url)
    if tmp_url.path           and \
       tmp_url.path[0] == '/' and \
       len(tmp_url.path) > 1:
        self._database_name = tmp_url.path[1:]
        logger.info("using database path %s" % self._database_name)
    else:
        logger.info("using database name %s" % self._database_name)

    # Loading all "default" resource configurations
    module_path  = os.path.dirname(os.path.abspath(__file__))
    default_cfgs = "%s/configs/*.json" % module_path
    config_files = glob.glob(default_cfgs)

    for config_file in config_files:

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            logger.error("skip config file %s: %s" % (config_file, e))
            continue

        for rc in rcs:
            logger.info("Loaded resource configurations for %s" % rc)
            self._resource_configs[rc] = rcs[rc].as_dict()

    user_cfgs    = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
    config_files = glob.glob(user_cfgs)

    for config_file in config_files:

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            logger.error("skip config file %s: %s" % (config_file, e))
            continue

        for rc in rcs:
            logger.info("Loaded resource configurations for %s" % rc)

            if rc in self._resource_configs:
                # config exists -- merge user config into it
                ru.dict_merge(self._resource_configs[rc],
                              rcs[rc].as_dict(),
                              policy='overwrite')
            else:
                # new config -- add as is
                self._resource_configs[rc] = rcs[rc].as_dict()

    default_aliases = "%s/configs/aliases.json" % module_path
    self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

    ##########################
    ## CREATE A NEW SESSION ##
    ##########################
    if uid is None:
        try:
            self._connected = None

            if name:
                self._name = name
                self._uid  = name
              # self._uid  = ru.generate_id('rp.session.'+name+'.%(item_counter)06d',
              #                             mode=ru.ID_CUSTOM)
            else:
                self._uid  = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            self._dbs, self._created, self._connection_info = \
                dbSession.new(sid     = self._uid,
                              name    = self._name,
                              db_url  = self._database_url,
                              db_name = database_name)

            logger.info("New Session created%s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL "
                                 "'%s' incorrect?): %s"
                                 % (self._database_url, ex))
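# Session construction sketch -- assumes a reachable MongoDB instance; the
# URL below is a placeholder, not a real service:
#
#   import radical.pilot as rp
#
#   session = rp.Session(database_url="mongodb://localhost:27017/",
#                        database_name="radicalpilot")
#   # equivalently, set RADICAL_PILOT_DBURL and call rp.Session()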
def _pilot_state_callback(self, pilot, state):

    try:
        with self.lock:

            pid = pilot.uid

            if pid not in self.pilots:
                # as we cannot unregister callbacks, we simply ignore this
                # invocation.  It's probably from a pilot we used previously.
                logger.warn("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)"
                            % (pid, state))
                return

            self.pilots[pid]['state'] = state
            logger.debug("[SchedulerCallback]: ComputePilot %s changed to %s"
                         % (pid, state))

            if state in [ACTIVE]:
                # the pilot is now ready to be used
                self._reschedule(target_pid=pid)

            if state in [DONE, FAILED, CANCELED]:

              # self._dump('pilot is final')

                # If the pilot state is 'DONE', 'FAILED' or 'CANCELED', we
                # need to reschedule the units which are reschedulable --
                # all others are marked 'FAILED' if they are already
                # 'EXECUTING' and not restartable
                timestamp = datetime.datetime.utcnow()
                self._db.change_compute_units(
                    filter_dict={"pilot": pid,
                                 "state": {"$in": [UNSCHEDULED,
                                                   PENDING_INPUT_STAGING,
                                                   STAGING_INPUT,
                                                   PENDING_EXECUTION,
                                                   SCHEDULING]}},
                    set_dict   ={"state": UNSCHEDULED,
                                 "pilot": None},
                    push_dict  ={"statehistory": {"state"    : UNSCHEDULED,
                                                  "timestamp": timestamp},
                                 "log"         : {"message"  : "reschedule unit",
                                                  "timestamp": timestamp}})

                self._db.change_compute_units(
                    filter_dict={"pilot"      : pid,
                                 "restartable": True,
                                 "state"      : {"$in": [EXECUTING,
                                                         PENDING_OUTPUT_STAGING,
                                                         STAGING_OUTPUT]}},
                    set_dict   ={"state": UNSCHEDULED,
                                 "pilot": None},
                    push_dict  ={"statehistory": {"state"    : UNSCHEDULED,
                                                  "timestamp": timestamp},
                                 "log"         : {"message"  : "reschedule unit",
                                                  "timestamp": timestamp}})

                self._db.change_compute_units(
                    filter_dict={"pilot"      : pid,
                                 "restartable": False,
                                 "state"      : {"$in": [EXECUTING,
                                                         PENDING_OUTPUT_STAGING,
                                                         STAGING_OUTPUT]}},
                    set_dict   ={"state": FAILED},
                    push_dict  ={"statehistory": {"state"    : FAILED,
                                                  "timestamp": timestamp},
                                 "log"         : {"message"  : "reschedule unit",
                                                  "timestamp": timestamp}})

                # make sure that restartable units got back into the
                # wait queue
                #
                # FIXME AM: state management: I don't have the unit state!
                # The new state was just pushed to the DB, but I have
                # actually no idea for which units, and the state known to
                # the worker (i.e. the cached state) is most likely
                # outdated.
                #
                # So we don't handle runq/waitq here.  Instead, we rely on
                # the unit cb to get invoked as soon as the state propagated
                # back to us, and then remove them from the runq.  This is
                # slow, potentially very slow, but safe.

                # we can't use this pilot anymore...
                del self.pilots[pid]
                # FIXME: how can I *un*register a pilot callback?

    except Exception as e:
      # import traceback
      # traceback.print_exc()
        logger.exception("error in pilot callback for backfiller (%s) - ignored" % e)
        raise
def schedule_compute_units(self, pilot_uid, units):
    """Request the scheduling of one or more ComputeUnits on a
       ComputePilot.
    """
    try:
        cu_transfer   = list()
        cu_notransfer = list()

        # Get some information about the pilot sandbox from the database.
        pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
        # TODO: this hack below relies on what?!  That there is just one pilot?
        pilot_sandbox = pilot_info[0]['sandbox']

        # Split units into two different lists: the first list contains the
        # CUs that need file transfer and the second list contains the CUs
        # that don't.  The latter is added to the pilot directly, while the
        # former is added to the transfer queue.
        for unit in units:

            # Create objects for staging status tracking
            unit.FTW_Input_Status        = None
            unit.FTW_Input_Directives    = []
            unit.Agent_Input_Status      = None
            unit.Agent_Input_Directives  = []
            unit.FTW_Output_Status       = None
            unit.FTW_Output_Directives   = []
            unit.Agent_Output_Status     = None
            unit.Agent_Output_Directives = []

            # Split the input staging directives over the transfer worker
            # and the agent
            input_sds = unit.description.input_staging
            if not isinstance(input_sds, list):
                # Ugly, but a workaround for iterating on the attribute
                # interface.
                # TODO: Verify if this piece of code is actually still required
                if input_sds:
                    input_sds = [input_sds]
                else:
                    input_sds = []

            for input_sd_entry in input_sds:
                action = input_sd_entry['action']
                source = Url(input_sd_entry['source'])
                target = Url(input_sd_entry['target'])

                new_sd = {'action'  : action,
                          'source'  : str(source),
                          'target'  : str(target),
                          'flags'   : input_sd_entry['flags'],
                          'priority': input_sd_entry['priority'],
                          'state'   : PENDING}

                if action in [LINK, COPY, MOVE]:
                    unit.Agent_Input_Directives.append(new_sd)
                    unit.Agent_Input_Status = PENDING
                elif action in [TRANSFER]:
                    if source.scheme and source.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote pull from the agent
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    else:
                        # Transfer from local to sandbox
                        unit.FTW_Input_Directives.append(new_sd)
                        unit.FTW_Input_Status = PENDING
                else:
                    logger.warn('Not sure if action %s makes sense for input staging' % action)

            # Split the output staging directives over the transfer worker
            # and the agent
            output_sds = unit.description.output_staging
            if not isinstance(output_sds, list):
                # Ugly, but a workaround for iterating on the attribute
                # interface.
                # TODO: Verify if this piece of code is actually still required
                if output_sds:
                    output_sds = [output_sds]
                else:
                    output_sds = []

            for output_sds_entry in output_sds:
                action = output_sds_entry['action']
                source = Url(output_sds_entry['source'])
                target = Url(output_sds_entry['target'])

                new_sd = {'action'  : action,
                          'source'  : str(source),
                          'target'  : str(target),
                          'flags'   : output_sds_entry['flags'],
                          'priority': output_sds_entry['priority'],
                          'state'   : PENDING}

                if action == LINK or action == COPY or action == MOVE:
                    unit.Agent_Output_Directives.append(new_sd)
                    unit.Agent_Output_Status = NEW
                elif action == TRANSFER:
                    if target.scheme and target.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote push from the agent
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    else:
                        # Transfer from sandbox back to local
                        unit.FTW_Output_Directives.append(new_sd)
                        unit.FTW_Output_Status = NEW
                else:
                    logger.warn('Not sure if action %s makes sense for output staging' % action)

            if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid,
                                                PENDING_INPUT_STAGING, log)
                cu_transfer.append(unit)
            else:
                cu_notransfer.append(unit)

        # Bulk-add all non-transfer units.
        self._db.assign_compute_units_to_pilot(units=cu_notransfer,
                                               pilot_uid=pilot_uid,
                                               pilot_sandbox=pilot_sandbox)

        self._db.assign_compute_units_to_pilot(units=cu_transfer,
                                               pilot_uid=pilot_uid,
                                               pilot_sandbox=pilot_sandbox)

        for unit in cu_notransfer:
            log = "Scheduled for execution on ComputePilot %s." % pilot_uid
            self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)
          # self._set_state(uid, PENDING_EXECUTION, log)

        logger.info("Scheduled ComputeUnits %s for execution on ComputePilot '%s'."
                    % (cu_notransfer, pilot_uid))

    except Exception as e:
        logger.exception('error in unit manager controller (schedule())')
        raise
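# For reference, the staging directive dicts constructed above have this
# shape (values are illustrative):
#
#   {'action'  : TRANSFER,          # or LINK / COPY / MOVE
#    'source'  : 'input.dat',       # local path or URL
#    'target'  : 'input.dat',       # path relative to the unit sandbox
#    'flags'   : [],                # e.g. CREATE_PARENTS
#    'priority': 0,
#    'state'   : PENDING}
#
# TRANSFER directives without a remote source scheme go to the FTW
# (file transfer worker) lists; everything else goes to the agent lists.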
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.info("Starting InputFileTransferWorker")

        # Try to connect to the database and create a tailable cursor.
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s."
                         % self.unit_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            raise

        try:
            while not self._stop.is_set():
                # See if we can find a ComputeUnit that is waiting for
                # input file transfer.
                compute_unit = None

                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Input_Status": PENDING},
                    update={"$set" : {"FTW_Input_Status": EXECUTING,
                                      "state": STAGING_INPUT},
                            "$push": {"statehistory": {"state": STAGING_INPUT,
                                                       "timestamp": ts}}},
                    limit=BULK_LIMIT  # TODO: bulklimit is probably not the
                                      #       best way to ensure there is
                                      #       just one
                )
                # FIXME: AM: find_and_modify is not bulkable!
                state = STAGING_INPUT

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)
                else:
                    compute_unit_id = None
                    try:
                        log_messages = []

                        # We have found a new CU.  Now we can process the
                        # transfer directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox  = compute_unit["sandbox"]
                        input_staging   = compute_unit["FTW_Input_Directives"]

                        # We need to create the CU's directory in case it
                        # doesn't exist yet.
                        log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                        log_messages.append(log_msg)
                        logger.info(log_msg)

                        # Creating the sandbox directory.
                        try:
                            logger.debug("saga.fs.Directory ('%s')" % remote_sandbox)

                            remote_sandbox_keyurl = saga.Url(remote_sandbox)
                            remote_sandbox_keyurl.path = '/'
                            remote_sandbox_key = str(remote_sandbox_keyurl)

                            if remote_sandbox_key not in self._saga_dirs:
                                self._saga_dirs[remote_sandbox_key] = \
                                    saga.filesystem.Directory(remote_sandbox_key,
                                                              flags=saga.filesystem.CREATE_PARENTS,
                                                              session=self._session)

                            saga_dir = self._saga_dirs[remote_sandbox_key]
                            saga_dir.make_dir(remote_sandbox,
                                              flags=saga.filesystem.CREATE_PARENTS)
                        except Exception as e:
                            logger.exception('Error: %s' % e)
                            # FIXME: why is this exception ignored?  AM

                        logger.info("Processing input file transfers for ComputeUnit %s"
                                    % compute_unit_id)
                        # Loop over all transfer directives and execute them.
                        for sd in input_staging:

                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                                fields=["state"]
                            )
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                state = CANCELED
                                break

                            abs_src = os.path.abspath(sd['source'])
                            input_file_url = saga.Url("file://localhost/%s" % abs_src)
                            if not sd['target']:
                                target = remote_sandbox
                            else:
                                target = "%s/%s" % (remote_sandbox, sd['target'])

                            log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            # Execute the transfer.
                            logger.debug("saga.fs.File ('%s')" % input_file_url)
                            input_file = saga.filesystem.File(
                                input_file_url,
                                session=self._session
                            )

                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                input_file.copy(target, flags=copy_flags)
                            except Exception as e:
                                logger.exception(e)
                            input_file.close()

                            # If all went fine, update the state of this
                            # StagingDirective to Done
                            um_col.find_and_modify(
                                query={"_id": compute_unit_id,
                                       'FTW_Input_Status': EXECUTING,
                                       'FTW_Input_Directives.state': PENDING,
                                       'FTW_Input_Directives.source': sd['source'],
                                       'FTW_Input_Directives.target': sd['target']},
                                update={'$set' : {'FTW_Input_Directives.$.state': 'Done'},
                                        '$push': {'log': {'timestamp': datetime.datetime.utcnow(),
                                                          'message'  : log_msg}}}
                            )

                    except Exception as e:
                        # Update the CU's state to 'FAILED'.
                        ts = datetime.datetime.utcnow()
                        logentry = {'message'  : "Input transfer failed: %s" % e,
                                    'timestamp': ts}
                        um_col.update({'_id': compute_unit_id},
                                      {'$set' : {'state': FAILED},
                                       '$push': {'statehistory': {'state': FAILED,
                                                                  'timestamp': ts},
                                                 'log': logentry}})
                        logger.exception(str(logentry))

                # Code below is only to be run by the "first" or only worker
                if self._worker_number > 1:
                    continue

                # If the CU was canceled we can skip the remainder of this
                # loop.
                if state == CANCELED:
                    continue

                #
                # Check to see if there are more pending Directives.
                # If not, we are Done.
                #
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [{"Agent_Input_Status": EXECUTING},
                                                {"FTW_Input_Status"  : EXECUTING}]})

                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:

                    # See if there are any FTW Input Directives still pending
                    if cu['FTW_Input_Status'] == EXECUTING and \
                       not any(d['state'] == EXECUTING or d['state'] == PENDING
                               for d in cu['FTW_Input_Directives']):
                        # All Input Directives for this FTW are done,
                        # mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set' : {'FTW_Input_Status': DONE},
                                       '$push': {'log': {'timestamp': datetime.datetime.utcnow(),
                                                         'message'  : 'All FTW Input Staging Directives done - %d.'
                                                                      % self._worker_number}}})

                    # See if there are any Agent Input Directives still
                    # pending or executing; if not, mark it DONE.
                    if cu['Agent_Input_Status'] == EXECUTING and \
                       not any(d['state'] == EXECUTING or d['state'] == PENDING
                               for d in cu['Agent_Input_Directives']):
                        # All Input Directives for this Agent are done,
                        # mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set' : {'Agent_Input_Status': DONE},
                                       '$push': {'log': {'timestamp': datetime.datetime.utcnow(),
                                                         'message'  : 'All Agent Input Staging Directives done - %d.'
                                                                      % self._worker_number}}})

                #
                # Check for all CUs if both Agent and FTW staging is done;
                # we can then mark the CU PendingExecution.
                #
                ts = datetime.datetime.utcnow()
                um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "Agent_Input_Status": {"$in": [None, DONE]},
                           "FTW_Input_Status"  : {"$in": [None, DONE]},
                           "state": STAGING_INPUT},
                    update={"$set" : {"state": PENDING_EXECUTION},
                            "$push": {"statehistory": {"state": PENDING_EXECUTION,
                                                       "timestamp": ts}}}
                )

        except Exception as e:
            logger.exception("transfer worker error: %s" % e)
            self._session.close(cleanup=False)
            raise

    except SystemExit as e:
        logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def run(self):
    """run() is called when the process is started via
       PilotManagerController.start().
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.debug("Worker thread (ID: %s[%s]) for PilotManager %s started."
                     % (self.name, self.ident, self._pm_id))

        while not self._stop.is_set():

          # # Check if one or more startup requests have finished.
          # self.startup_results_lock.acquire()
          # new_startup_results = list()
          # for transfer_result in self.startup_results:
          #     if transfer_result.ready():
          #         result = transfer_result.get()
          #         self._db.update_pilot_state(
          #             pilot_uid=result["pilot_uid"],
          #             state=result["state"],
          #             sagajobid=result["saga_job_id"],
          #             pilot_sandbox=result["sandbox"],
          #             global_sandbox=result["global_sandbox"],
          #             submitted=result["submitted"],
          #             logs=result["logs"])
          #     else:
          #         new_startup_results.append(transfer_result)
          # self.startup_results = new_startup_results
          # self.startup_results_lock.release()

            # Check and update pilots.  This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            pilot_list = self._db.get_pilots(pilot_manager_id=self._pm_id)
            action = False

            for pilot in pilot_list:
                pilot_id = str(pilot["_id"])

                new_state = pilot["state"]
                if pilot_id in self._shared_data:
                    old_state = self._shared_data[pilot_id]["data"]["state"]
                else:
                    old_state = None
                    self._shared_data[pilot_id] = {
                        'data'         : pilot,
                        'callbacks'    : [],
                        'facade_object': None
                    }

                self._shared_data[pilot_id]['data'] = pilot

                # FIXME: *groan* what a hack...  The Canceling state is by
                # the nature of it not recorded in the database, but only in
                # the local cache.  So if we see it as old state, we have to
                # avoid state transitions into non-final states in the cache
                # at all cost -- so we catch this here specifically.
                no_cb = False
                if old_state == CANCELING:
                    if new_state not in [DONE, FAILED, CANCELED]:
                        # restore old state, making the cache explicitly
                        # different than the DB recorded state
                        self._shared_data[pilot_id]["data"]["state"] = old_state
                        # do not trigger a state cb!
                        no_cb = True

                if new_state != old_state:
                    action = True

                    if not no_cb:
                        # On a state change, we fire zee callbacks.
                        logger.info("ComputePilot '%s' state changed from '%s' to '%s'."
                                    % (pilot_id, old_state, new_state))

                        # The state of the pilot has changed.  We call all
                        # pilot-level callbacks to propagate this.  This also
                        # includes communication to the unit scheduler which
                        # may, or may not, cancel the pilot's units.
                        self.call_callbacks(pilot_id, new_state)

                    # If the state is 'DONE', 'FAILED' or 'CANCELED', we also
                    # set the state of the compute unit accordingly (but only
                    # for non-final units)
                    if new_state in [FAILED, DONE, CANCELED]:
                        unit_ids = self._db.pilot_list_compute_units(pilot_uid=pilot_id)
                        self._db.set_compute_unit_state(
                            unit_ids=unit_ids,
                            state=CANCELED,
                            src_states=[PENDING_INPUT_STAGING,
                                        STAGING_INPUT,
                                        PENDING_EXECUTION,
                                        SCHEDULING,
                                        EXECUTING,
                                        PENDING_OUTPUT_STAGING,
                                        STAGING_OUTPUT],
                            log="Pilot '%s' has terminated with state '%s'. CU canceled."
                                % (pilot_id, new_state))

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)

    except SystemExit as e:
        logger.exception("pilot manager controller thread caught system exit "
                         "-- forcing application shutdown")
        import thread
        thread.interrupt_main()

    finally:
        # shut down the autonomous pilot launcher worker(s)
        for worker in self._pilot_launcher_worker_pool:
            logger.debug("pworker %s stops launcher %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("pworker %s stopped launcher %s" % (self.name, worker.name))
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s."
                         % self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reason and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job
            # service, a job description and a job that is then sent to the
            # local or remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            compute_pilot = None

            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set" : {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}})

            if not compute_pilot:
                time.sleep(IDLE_TIMER)
            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid   = self.db_connection_info.session_id
                    database_url  = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores   = compute_pilot['description']['cores']
                    runtime        = compute_pilot['description']['runtime']
                    queue          = compute_pilot['description']['queue']
                    project        = compute_pilot['description']['project']
                    cleanup        = compute_pilot['description']['cleanup']
                    resource_key   = compute_pilot['description']['resource']
                    schema         = compute_pilot['description']['access_schema']
                    memory         = compute_pilot['description']['memory']
                    pilot_sandbox  = compute_pilot['sandbox']
                    global_sandbox = compute_pilot['global_sandbox']

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema, so better use a deep
                    # copy...
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                  # import pprint
                  # pprint.pprint(resource_cfg)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_mongodb_endpoint  = resource_cfg.get('agent_mongodb_endpoint', database_url)
                    agent_spawner           = resource_cfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
                    agent_type              = resource_cfg.get('agent_type', DEFAULT_AGENT_TYPE)
                    agent_scheduler         = resource_cfg.get('agent_scheduler')
                    tunnel_bind_device      = resource_cfg.get('tunnel_bind_device')
                    default_queue           = resource_cfg.get('default_queue')
                    forward_tunnel_endpoint = resource_cfg.get('forward_tunnel_endpoint')
                    js_endpoint             = resource_cfg.get('job_manager_endpoint')
                    lrms                    = resource_cfg.get('lrms')
                    mpi_launch_method       = resource_cfg.get('mpi_launch_method')
                    pre_bootstrap           = resource_cfg.get('pre_bootstrap')
                    python_interpreter      = resource_cfg.get('python_interpreter')
                    spmd_variation          = resource_cfg.get('spmd_variation')
                    task_launch_method      = resource_cfg.get('task_launch_method')
                    rp_version              = resource_cfg.get('rp_version', DEFAULT_RP_VERSION)
                    virtenv_mode            = resource_cfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
                    virtenv                 = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                    stage_cacerts           = resource_cfg.get('stage_cacerts', 'False')

                    if stage_cacerts.lower() == 'true':
                        stage_cacerts = True
                    else:
                        stage_cacerts = False

                    # expand variables in virtenv string
                    virtenv = virtenv % {'pilot_sandbox' : saga.Url(pilot_sandbox).path,
                                         'global_sandbox': saga.Url(global_sandbox).path}

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get('global_virtenv')
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = 'use'

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = 'mongodb'
                    if not db_url.host:
                        db_url.host = 'localhost'
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = 'radicalpilot'

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We always use "default_bootstrapper.sh".
                    bootstrapper = 'default_bootstrapper.sh'
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s"
                                                        % (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from rp_version,
                    # which has the following format and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit:  # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release:               # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local:                 # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug:                 # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed:             # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples:
                    #   [email protected]
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode.  If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # An rp_version which does not adhere to the above
                    # syntax is ignored, and the fallback stage@local
                    # is used.
                    if not rp_version.startswith('@') and \
                       rp_version not in ['installed', 'local', 'debug']:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ['installed', 'release']:
                        stage_sdist = False
                    if rp_version.startswith('@'):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:
                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated cert bundle.
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s"
                                                  % (mod_dir, 'cacert.pem.gz'))

                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    debug_level = os.environ.get('RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        debug_level = {'CRITICAL': 1,
                                       'ERROR'   : 2,
                                       'WARNING' : 3,
                                       'WARN'    : 3,
                                       'INFO'    : 4,
                                       'DEBUG'   : 5}.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = 'luve'  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)
                        # we never clean up virtenvs which are not private
                        if virtenv_mode != 'private':
                            cleanup = cleanup.replace('v', '')

                    sdists = ':'.join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args  = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it
                    # is, we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data['job_services']:
                        js = self._shared_worker_data['job_services'][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data['job_services'][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable            = "/bin/bash"
                    jd.arguments             = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory     = saga.Url(pilot_sandbox).path
                    jd.project               = project
                    jd.output                = "agent.out"
                    jd.error                 = "agent.err"
                    jd.total_cpu_count       = number_cores
                    jd.wall_time_limit       = runtime
                    jd.total_physical_memory = memory
                    jd.queue                 = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if 'RADICAL_PILOT_PROFILE' in os.environ:
                        jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data['job_ids'][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                    # job submission was successful.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id"  : pilot_id,
                         "state": 'Launching'},
                        {"$set"    : {"state"      : PENDING_ACTIVE,
                                      "saga_job_id": saga_job_id},
                         "$push"   : {"statehistory": {"state"    : PENDING_ACTIVE,
                                                       "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

                    if ret['n'] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # job id then.
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {"$set"    : {"saga_job_id": saga_job_id},
                             "$push"   : {"statehistory": {"state"    : PENDING_ACTIVE,
                                                           "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}})

                except Exception as e:
                    # Update the Pilot's state to 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing an '#'.  This shows up here.
                    # Until we find a clean workaround, make the log shorter
                    # and rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts    = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id"  : pilot_id,
                         "state": {"$ne": FAILED}},
                        {"$set"    : {"state"  : FAILED,
                                      "stdout" : out,
                                      "stderr" : err,
                                      "logfile": log},
                         "$push"   : {"statehistory": {"state"    : FAILED,
                                                       "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})
                    logger.exception('\n'.join(log_messages))

    except SystemExit as e:
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
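# The launcher above reads its parameters from a resource configuration
# entry.  A minimal sketch of such a JSON entry (the keys match the
# resource_cfg.get() calls above; the values are made-up examples):
#
#   {
#       "job_manager_endpoint": "slurm+ssh://cluster.example.org/",
#       "default_queue"       : "normal",
#       "lrms"                : "SLURM",
#       "agent_scheduler"     : "CONTINUOUS",
#       "agent_spawner"       : "POPEN",
#       "task_launch_method"  : "SSH",
#       "mpi_launch_method"   : "MPIRUN",
#       "pre_bootstrap"       : ["module load python"],
#       "virtenv"             : "%(global_sandbox)s/ve",
#       "virtenv_mode"        : "create",
#       "rp_version"          : "local"
#   }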
def run(self):
    """run() is called when the process is started via
       UnitManagerController.start().
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.debug("Worker thread (ID: %s[%s]) for UnitManager %s started."
                     % (self.name, self.ident, self._um_id))

        # transfer_results contains the futures to the results of the
        # asynchronous transfer operations.
        transfer_results = list()

        while not self._stop.is_set():

            # =================================================================
            #
            # Check and update units.  This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            unit_list = self._db.get_compute_units(unit_manager_id=self._um_id)
            action = False

            for unit in unit_list:
                unit_id = str(unit["_id"])

                new_state = unit["state"]
                if unit_id in self._shared_data:
                    old_state = self._shared_data[unit_id]["data"]["state"]
                else:
                    old_state = None
                    self._shared_data_lock.acquire()
                    self._shared_data[unit_id] = {
                        'data'         : unit,
                        'callbacks'    : [],
                        'facade_object': None
                    }
                    self._shared_data_lock.release()

                self._shared_data_lock.acquire()
                self._shared_data[unit_id]["data"] = unit
                self._shared_data_lock.release()

                if new_state != old_state:
                    # On a state change, we fire zee callbacks.
                    logger.info("RUN ComputeUnit '%s' state changed from '%s' to '%s'."
                                % (unit_id, old_state, new_state))

                    # The state of the unit has changed.  We call all
                    # unit-level callbacks to propagate this.
                    self.call_unit_state_callbacks(unit_id, new_state)

                    action = True

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)

    except SystemExit as e:
        logger.exception("unit manager controller thread caught system exit "
                         "-- forcing application shutdown")
        import thread
        thread.interrupt_main()

    finally:
        # shut down the autonomous input / output transfer worker(s)
        for worker in self._input_file_transfer_worker_pool:
            logger.debug("uworker %s stops itransfer %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped itransfer %s" % (self.name, worker.name))

        for worker in self._output_file_transfer_worker_pool:
            logger.debug("uworker %s stops otransfer %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped otransfer %s" % (self.name, worker.name))
def register_cancel_pilots_request(self, pilot_ids=None):
    """Registers one or more pilots for cancellation.
    """
    if pilot_ids is None:
        pilot_ids = list()
        for pilot in self._db.get_pilots(pilot_manager_id=self._pm_id):
            pilot_ids.append(str(pilot["_id"]))

    self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
    logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

    # pilots which are in ACTIVE state should now have time to react on the
    # CANCEL command sent above.  Meanwhile, we'll cancel all pending
    # pilots.  If that is done, we wait a little, say 10 seconds, to give
    # the pilot time to pick up the request and shut down -- but if it does
    # not do that, it will get killed the hard way...
    delayed_cancel = list()

    for pilot_id in pilot_ids:
        if pilot_id in self._shared_data:

            # read state from _shared_data only once, so that it does not
            # change under us...
            old_state = str(self._shared_data[pilot_id]["data"]["state"])
            logger.warn("actively cancel pilot %s state: %s" % (pilot_id, old_state))

            if old_state in [DONE, FAILED, CANCELED]:
                logger.warn("can't actively cancel pilot %s: already in final state" % pilot_id)

            elif old_state in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE]:
                if pilot_id in self._shared_worker_data['job_ids']:

                    try:
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        self._shared_data[pilot_id]["data"]["state"] = CANCELING
                        logger.info("actively cancel pilot %s (%s, %s)"
                                    % (pilot_id, job_id, js_url))

                        js  = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job(job_id)
                        job.cancel()
                    except Exception as e:
                        logger.exception('pilot cancellation failed')

                else:
                    logger.warn("can't actively cancel pilot %s: no job id known" % pilot_id)
                    logger.debug(pprint.pformat(self._shared_worker_data))

            else:
                logger.debug("delay to actively cancel pilot %s: state %s"
                             % (pilot_id, old_state))
                delayed_cancel.append(pilot_id)

        else:
            logger.warn("can't actively cancel pilot %s: unknown pilot" % pilot_id)
            logger.debug(pprint.pformat(self._shared_data))

    # now tend to all delayed cancellation requests (i.e. active pilots) --
    # if there are any
    if delayed_cancel:

        # grant some leeway to the unruly children...
        time.sleep(10)

        for pilot_id in delayed_cancel:

            if pilot_id in self._shared_worker_data['job_ids']:

                try:
                    job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                    logger.info("actively cancel pilot %s (delayed) (%s, %s)"
                                % (pilot_id, job_id, js_url))

                    js  = self._shared_worker_data['job_services'][js_url]
                    job = js.get_job(job_id)
                    job.cancel()
                except Exception as e:
                    logger.warn('delayed pilot cancellation failed. '
                                'This is not necessarily a problem.')

            else:
                logger.warn("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
                logger.debug(pprint.pformat(self._shared_worker_data))
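# ---------------------------------------------------------------------------
# NOTE: the FIXME in the output transfer worker below points out that
# find_and_modify() only ever claims a single document, so the
# 'limit=BULK_LIMIT' argument has no effect.  A hedged sketch of a true bulk
# claim follows (illustrative only; unlike find_and_modify, the find/update
# pair below is not atomic, so two workers could race for the same units):
#
def _example_claim_pending_units(um_col, unit_manager_id, bulk_limit):
    """Claim up to bulk_limit units that are pending output transfer."""
    docs = list(um_col.find({"unitmanager": unit_manager_id,
                             "FTW_Output_Status": PENDING}).limit(bulk_limit))
    if docs:
        um_col.update({"_id": {"$in": [d["_id"] for d in docs]}},
                      {"$set": {"FTW_Output_Status": EXECUTING}},
                      multi=True)
    return docs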
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Try to connect to the database and create a tailable cursor.
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." %
                         self.unit_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        while not self._stop.is_set():
            compute_unit = None

            # See if we can find a ComputeUnit that is waiting for
            # output file transfer.
            ts = datetime.datetime.utcnow()
            compute_unit = um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       "FTW_Output_Status": PENDING},
                update={"$set": {"FTW_Output_Status": EXECUTING,
                                 "state": STAGING_OUTPUT},
                        "$push": {"statehistory": {"state": STAGING_OUTPUT,
                                                   "timestamp": ts}}},
                limit=BULK_LIMIT
            )
            # FIXME: AM: find_and_modify is not bulkable!
            state = STAGING_OUTPUT

            if compute_unit is None:
                # Sleep a bit if no new units are available.
                time.sleep(IDLE_TIME)
            else:
                logger.info("OFTW cu found, progressing ...")
                compute_unit_id = None
                try:
                    # We have found a new CU.  Now we can process the
                    # transfer directive(s) with SAGA.
                    compute_unit_id    = str(compute_unit["_id"])
                    remote_sandbox     = compute_unit["sandbox"]
                    staging_directives = compute_unit["FTW_Output_Directives"]

                    logger.info("Processing output file transfers for ComputeUnit %s" %
                                compute_unit_id)

                    # Loop over all staging directives and execute them.
                    for sd in staging_directives:

                        # Check if there was a cancel request
                        state_doc = um_col.find_one({"_id": compute_unit_id},
                                                    fields=["state"])
                        if state_doc['state'] == CANCELED:
                            logger.info("ComputeUnit canceled, interrupting output file transfers.")
                            state = CANCELED
                            break

                        action = sd['action']
                        source = sd['source']
                        target = sd['target']
                        flags  = sd['flags']

                        # Mark the beginning of transfer for this StagingDirective
                        um_col.find_and_modify(
                            query={"_id": compute_unit_id,
                                   'FTW_Output_Status': EXECUTING,
                                   'FTW_Output_Directives.state': PENDING,
                                   'FTW_Output_Directives.source': sd['source'],
                                   'FTW_Output_Directives.target': sd['target']},
                            update={'$set': {'FTW_Output_Directives.$.state': EXECUTING},
                                    '$push': {'log': {
                                        'timestamp': datetime.datetime.utcnow(),
                                        'message': 'Starting transfer of %s' % source}}}
                        )

                        abs_source = "%s/%s" % (remote_sandbox, source)

                        if os.path.basename(target) == target:
                            abs_target = "file://localhost%s" % os.path.join(os.getcwd(), target)
                        else:
                            abs_target = "file://localhost%s" % os.path.abspath(target)

                        log_msg = "Transferring output file %s -> %s" % (abs_source, abs_target)
                        logger.debug(log_msg)

                        logger.debug("saga.fs.File ('%s')" % saga.Url(abs_source))
                        output_file = saga.filesystem.File(saga.Url(abs_source),
                                                           session=self._session)

                        if CREATE_PARENTS in flags:
                            copy_flags = saga.filesystem.CREATE_PARENTS
                        else:
                            copy_flags = 0

                        logger.debug("saga.fs.File.copy ('%s')" % saga.Url(abs_target))
                        output_file.copy(saga.Url(abs_target), flags=copy_flags)
                        output_file.close()

                        # If all went fine, update the state of this
                        # StagingDirective to DONE.
                        um_col.find_and_modify(
                            query={"_id": compute_unit_id,
                                   'FTW_Output_Status': EXECUTING,
                                   'FTW_Output_Directives.state': EXECUTING,
                                   'FTW_Output_Directives.source': sd['source'],
                                   'FTW_Output_Directives.target': sd['target']},
                            update={'$set': {'FTW_Output_Directives.$.state': DONE},
                                    '$push': {'log': {
                                        'timestamp': datetime.datetime.utcnow(),
                                        'message': log_msg}}}
                        )

                except Exception as e:
                    # Update the CU's state to 'FAILED'.
                    ts = datetime.datetime.utcnow()
                    log_message = "Output transfer failed: %s" % e
                    # TODO: not only mark the CU as failed, but also the specific Directive
                    um_col.update({'_id': compute_unit_id},
                                  {'$set': {'state': FAILED},
                                   '$push': {
                                       'statehistory': {'state': FAILED, 'timestamp': ts},
                                       'log': {'message': log_message, 'timestamp': ts}}})
                    logger.exception(log_message)

            # Code below is only to be run by the "first" or only worker
            if self._worker_number > 1:
                continue

            # If the CU was canceled we can skip the remainder of this loop.
            if state == CANCELED:
                continue

            #
            # Check to see if there are more active Directives; if not, we are done.
            #
            cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                    "$or": [{"Agent_Output_Status": EXECUTING},
                                            {"FTW_Output_Status": EXECUTING}]})

            # Iterate over all the returned CUs (if any)
            for cu in cursor_w:

                # See if there are any FTW Output Directives still pending
                if cu['FTW_Output_Status'] == EXECUTING and \
                        not any(d['state'] in (EXECUTING, PENDING)
                                for d in cu['FTW_Output_Directives']):
                    # All Output Directives for this FTW are done; mark the CU accordingly
                    um_col.update({"_id": cu["_id"]},
                                  {'$set': {'FTW_Output_Status': DONE},
                                   '$push': {'log': {
                                       'timestamp': datetime.datetime.utcnow(),
                                       'message': 'All FTW output staging directives done - %d.' %
                                                  self._worker_number}}})

                # See if there are any Agent Output Directives still pending
                if cu['Agent_Output_Status'] == EXECUTING and \
                        not any(d['state'] in (EXECUTING, PENDING)
                                for d in cu['Agent_Output_Directives']):
                    # All Output Directives for this Agent are done; mark the CU accordingly
                    um_col.update({"_id": cu["_id"]},
                                  {'$set': {'Agent_Output_Status': DONE},
                                   '$push': {'log': {
                                       'timestamp': datetime.datetime.utcnow(),
                                       'message': 'All Agent output staging directives done - %d.' %
                                                  self._worker_number}}})

            #
            # Check for all CUs if both Agent and FTW staging is done; if so,
            # we can mark the CU as DONE.
            #
            ts = datetime.datetime.utcnow()
            um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       # TODO: Now that our state model is linear,
                       # we probably don't need to check Agent_Output_Status anymore.
                       # Given that it is not updated by the agent currently, disable it here.
                       # "Agent_Output_Status": {"$in": [None, DONE]},
                       "FTW_Output_Status": {"$in": [None, DONE]},
                       "state": STAGING_OUTPUT},
                update={"$set": {"state": DONE},
                        "$push": {"statehistory": {"state": DONE, "timestamp": ts}}}
            )

    except SystemExit as e:
        logger.exception("output file transfer thread caught system exit -- "
                         "forcing application shutdown")
        import thread
        thread.interrupt_main()
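# ---------------------------------------------------------------------------
# NOTE: the worker above consumes 'FTW_Output_Directives' entries with the
# following fields (field names taken from the code above; the concrete
# values are made up for illustration):
#
#   {
#       'action': 'Transfer',         # hypothetical action value
#       'source': 'output.dat',       # path relative to the unit sandbox
#       'target': '/tmp/output.dat',  # local target path or bare file name
#       'flags':  [CREATE_PARENTS],   # optional transfer flags
#       'state':  PENDING,            # PENDING -> EXECUTING -> DONE
#   }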
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get the directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." %
                         self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing, and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reason and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job service,
            # a job description and a job that is then sent to the local or
            # remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            compute_pilot = None

            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}}
            )

            if not compute_pilot:
                time.sleep(IDLE_TIMER)
            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid   = self.db_connection_info.session_id
                    database_url  = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores = compute_pilot["description"]["cores"]
                    runtime      = compute_pilot["description"]["runtime"]
                    queue        = compute_pilot["description"]["queue"]
                    project      = compute_pilot["description"]["project"]
                    cleanup      = compute_pilot["description"]["cleanup"]
                    resource_key = compute_pilot["description"]["resource"]
                    schema       = compute_pilot["description"]["access_schema"]
                    memory       = compute_pilot["description"]["memory"]

                    pilot_sandbox  = compute_pilot["sandbox"]
                    global_sandbox = compute_pilot["global_sandbox"]

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema, so better use a deep
                    # copy...
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # ------------------------------------------------------
                    # get parameters from the config; set defaults where needed
                    agent_mongodb_endpoint  = resource_cfg.get("agent_mongodb_endpoint", database_url)
                    agent_spawner           = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                    agent_type              = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                    agent_scheduler         = resource_cfg.get("agent_scheduler")
                    tunnel_bind_device      = resource_cfg.get("tunnel_bind_device")
                    default_queue           = resource_cfg.get("default_queue")
                    forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                    js_endpoint             = resource_cfg.get("job_manager_endpoint")
                    lrms                    = resource_cfg.get("lrms")
                    mpi_launch_method       = resource_cfg.get("mpi_launch_method")
                    pre_bootstrap           = resource_cfg.get("pre_bootstrap")
                    python_interpreter      = resource_cfg.get("python_interpreter")
                    spmd_variation          = resource_cfg.get("spmd_variation")
                    task_launch_method      = resource_cfg.get("task_launch_method")
                    rp_version              = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                    virtenv_mode            = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                    virtenv                 = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                    stage_cacerts           = resource_cfg.get("stage_cacerts", "False")

                    # the config value is a string -- convert it to a bool
                    stage_cacerts = (stage_cacerts.lower() == "true")

                    # expand variables in the virtenv string
                    virtenv = virtenv % {
                        "pilot_sandbox":  saga.Url(pilot_sandbox).path,
                        "global_sandbox": saga.Url(global_sandbox).path}

                    # Check for the deprecated 'global_virtenv' setting
                    global_virtenv = resource_cfg.get("global_virtenv")
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- "
                                    "use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = "use"

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = "mongodb"
                    if not db_url.host:
                        db_url.host = "localhost"
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = "radicalpilot"

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We always use "default_bootstrapper.sh".
                    bootstrapper = "default_bootstrapper.sh"
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" %
                                                        (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from rp_version,
                    # which has the following format and interpretation:
                    #
                    # case rp_version:
                    #   @<token> (@tag / @branch / @commit):  # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release:  # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local:  # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug:  # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed:  # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode:
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples:
                    #   virtenv@<tag>
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid, specifically
                    # in the context of virtenv_mode.  If, for example,
                    # virtenv_mode is 'use', then 'virtenv@tag' will not make
                    # sense, as the virtenv is not updated.  In those cases,
                    # the virtenv_mode is honored, and a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # An rp_version which does not adhere to the above syntax
                    # is ignored, and the fallback stage@local is used.

                    # Note: "release" is documented as valid above and handled
                    # below, so it is included in this check.
                    if not rp_version.startswith("@") and \
                            rp_version not in ["installed", "local", "debug", "release"]:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ["installed", "release"]:
                        stage_sdist = False

                    if rp_version.startswith("@"):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip the '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:
                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." % \
                                  (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated cert bundle.
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s" %
                                                  (mod_dir, "cacert.pem.gz"))

                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        debug_level = {
                            "CRITICAL": 1,
                            "ERROR":    2,
                            "WARNING":  3,
                            "WARN":     3,
                            "INFO":     4,
                            "DEBUG":    5}.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        # l : log files
                        # u : unit work dirs
                        # v : virtualenv
                        # e : everything (== pilot sandbox)
                        cleanup = "luve"

                    # we never clean up virtenvs which are not private
                    if virtenv_mode != "private":
                        cleanup = cleanup.replace("v", "")

                    sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args  = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it is,
                    # we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data["job_services"]:
                        js = self._shared_worker_data["job_services"][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data["job_services"][js_url] = js

                    # ------------------------------------------------------
                    # Create a SAGA job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable            = "/bin/bash"
                    jd.arguments             = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory     = saga.Url(pilot_sandbox).path
                    jd.project               = project
                    jd.output                = "agent.out"
                    jd.error                 = "agent.err"
                    jd.total_cpu_count       = number_cores
                    jd.wall_time_limit       = runtime
                    jd.total_physical_memory = memory
                    jd.queue                 = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if "RADICAL_PILOT_PROFILE" in os.environ:
                        jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                    logger.debug("Bootstrap command line: %s %s" %
                                 (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the pilot's state to 'PENDING_ACTIVE' if the
                    # SAGA job submission was successful.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": LAUNCHING},
                        {"$set": {"state": PENDING_ACTIVE,
                                  "saga_job_id": saga_job_id},
                         "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}}
                    )

                    if ret["n"] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # job id then.
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {"$set": {"saga_job_id": saga_job_id},
                             "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                        "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}}
                        )

                except Exception as e:
                    # Update the pilot's state to 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing a '#'.  This shows up here.
                    # Until we find a clean workaround, keep the log shorter
                    # and rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts    = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {"$set": {"state": FAILED,
                                  "stdout": out,
                                  "stderr": err,
                                  "logfile": log},
                         "$push": {"statehistory": {"state": FAILED,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}}
                    )
                    logger.exception("\n".join(log_messages))

    except SystemExit as e:
        logger.exception("pilot launcher thread caught system exit -- "
                         "forcing application shutdown")
        import thread
        thread.interrupt_main()
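# ---------------------------------------------------------------------------
# NOTE: for reference, the job description assembled above launches the
# bootstrapper via a command line of roughly the following shape (all
# argument values here are illustrative, not actual defaults):
#
#   /bin/bash -l pilot_bootstrapper.sh \
#       -b 'radical.utils.tgz:saga-python.tgz:radical.pilot.tgz' \
#       -c '32' -d '4' -g '/path/to/virtenv' -l 'SLURM' \
#       -m 'localhost:27017' -n 'radicalpilot' -p '<pilot_id>' \
#       -r '60' -s '<session_uid>' -u 'update' -v 'release'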