Example #1
0
    def unschedule(self, units):
        """Unschedule one or more ComputeUnits.

        This implementation does not unschedule anything: it only logs a
        warning which names the scheduler.
        """

        message = "scheduler %s does not implement 'unschedule()'" % self.name
        logger.warn(message)
Example #2
0
    def pilot_added(self, pilot):
        """Inform the scheduler about a new pilot.

        This implementation ignores the pilot: it only logs a warning which
        names the scheduler.
        """

        message = "scheduler %s does not implement 'pilot_added()'" % self.name
        logger.warn(message)
Example #3
0
    def pilot_removed(self, pilot):
        """Inform the scheduler about a pilot removal.

        This implementation ignores the removal: it only logs a warning which
        names the scheduler.
        """

        message = "scheduler %s does not implement 'pilot_removed()'" % self.name
        logger.warn(message)
Example #4
0
    def register_cancel_pilots_request(self, pilot_ids=None):
        """Register one or more pilots for cancelation.

        The cancel command is first sent to the pilots through the database.
        After that, the jobs behind the pilots are canceled directly, depending
        on the pilot state found in ``self._shared_data``:

          * DONE, FAILED, CANCELED: nothing is canceled, a warning is logged.
          * PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE: the pilot state is set
            to CANCELING and the job is canceled right away.
          * any other state: the job is canceled after a grace period of 10
            seconds, so that the pilot can react to the cancel command first.

        Note that this call blocks for the grace period if at least one
        cancelation was delayed.  Failures to cancel a job are logged and do
        not propagate to the caller.

        :param pilot_ids: ids of the pilots to cancel.  If None, all pilots
                          which the database reports for this pilot manager
                          are canceled.
        """

        if pilot_ids is None:
            pilot_ids = [str(pilot["_id"]) for pilot
                         in self._db.get_pilots(pilot_manager_id=self._pm_id)]

        self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
        logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

        # pilots which are in ACTIVE state should now have time to react on the
        # CANCEL command sent above.  Meanwhile, we'll cancel all pending
        # pilots.  If that is done, we wait a little, say 10 seconds, to give
        # the pilot time to pick up the request and shut down -- but if it does
        # not do that, it will get killed the hard way...
        delayed_cancel = list()

        for pilot_id in pilot_ids :

            if  pilot_id not in self._shared_data :
                logger.warn  ("can't actively cancel pilot %s: unknown pilot" % pilot_id)
                logger.debug (pprint.pformat (self._shared_data))
                continue

            # read state from _shared_data only once, so that it does not
            # change under us...
            old_state = str(self._shared_data[pilot_id]["data"]["state"])

            logger.warn ("actively cancel pilot %s state: %s" % (pilot_id, old_state))

            if  old_state in [DONE, FAILED, CANCELED] :
                logger.warn ("can't actively cancel pilot %s: already in final state" % pilot_id)

            elif old_state in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE] :

                if  pilot_id in self._shared_worker_data['job_ids'] :

                    try :
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        self._shared_data[pilot_id]["data"]["state"] = CANCELING
                        logger.info ("actively cancel pilot %s (%s, %s)" % (pilot_id, job_id, js_url))

                        js  = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job (job_id)
                        job.cancel ()
                    except Exception :
                        # logger.exception records the traceback for us
                        logger.exception ('pilot cancelation failed')

                else :
                    logger.warn ("can't actively cancel pilot %s: no job id known" % pilot_id)
                    logger.debug (pprint.pformat (self._shared_worker_data))

            else :
                logger.debug ("delay to actively cancel pilot %s: state %s" % (pilot_id, old_state))
                delayed_cancel.append (pilot_id)

        # now tend to all delayed cancellation requests (ie. active pilots) --
        # if there are any
        if  not delayed_cancel :
            return

        # grant some leeway to the unruly children...
        time.sleep (10)

        for pilot_id in delayed_cancel :

            # the job ids are looked up again after the grace period, not
            # cached from before the sleep
            if  pilot_id in self._shared_worker_data['job_ids'] :

                try :
                    job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                    logger.info ("actively cancel pilot %s (delayed) (%s, %s)" % (pilot_id, job_id, js_url))

                    js  = self._shared_worker_data['job_services'][js_url]
                    job = js.get_job (job_id)
                    job.cancel ()
                except Exception as e :
                    # best effort: the pilot may have shut down on its own
                    # meanwhile -- but keep the reason in the log.
                    logger.warn ('delayed pilot cancelation failed for pilot %s (%s). '
                                 'This is not necessarily a problem.' % (pilot_id, e))

            else :
                logger.warn ("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
                logger.debug (pprint.pformat (self._shared_worker_data))
Example #5
0
    def handle_schedule (self, schedule) :
        """Dispatch a unit schedule to the pilots it refers to.

        Units are grouped by the pilot they are assigned to, so that they can
        be submitted to each pilot in bulk.  Units which have no pilot assigned
        are handed back to the worker as unscheduled, and a change of the
        number of unscheduled units is reported via a WAIT_QUEUE_SIZE manager
        callback.

        :param schedule: mapping with the entries 'units' (unit -> pilot id,
                         or None for units which could not be placed) and
                         'pilots' (pilot id -> mapping with at least 'sandbox'
                         and 'resource').  An empty schedule is ignored.
        :raises RuntimeError: if a unit is assigned to a pilot which is not
                         listed by ``self.list_pilots()``.
        """

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.

        if  not schedule :
            logger.debug ('skipping empty unit schedule')
            return

        pilot_cu_map = dict()
        unscheduled  = list()

        pilot_ids = self.list_pilots ()

        for unit, pid in schedule['units'].items() :

            if  pid is None :
                unscheduled.append (unit)
                continue

            if  pid not in pilot_ids :
                raise RuntimeError ("schedule points to unknown pilot %s" % pid)

            pilot_cu_map.setdefault (pid, list()).append (unit)


        # submit to all pilots which got something submitted to
        for pid, units in pilot_cu_map.items() :

            if  pid not in schedule['pilots'] :
                # lost pilot, do not schedule its units
                # NOTE(review): these units are neither scheduled nor added to
                # 'unscheduled' -- confirm that this is intended.
                for unit in units :
                    logger.warn ("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                continue

            pilot_info        = schedule['pilots'][pid]
            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in units :

                unit.sandbox = pilot_info['sandbox'] + "/" + str(unit.uid)

                ud = unit.description

                if  'kernel' in ud and ud['kernel'] :

                    try :
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception :
                        logger.error ("Kernels are not supported in "
                                      "compute unit descriptions -- install "
                                      "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state (unit._uid, FAILED,
                                ["kernel expansion failed"])
                        continue

                    # replace the kernel name by the settings the kernel
                    # binds to for the pilot's resource
                    mdtd           = MDTaskDescription ()
                    mdtd.kernel    = ud.kernel
                    mdtd_bound     = mdtd.bind (resource=pilot_info['resource'])
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec    = mdtd_bound.pre_exec
                    ud.executable  = mdtd_bound.executable
                    ud.mpi         = mdtd_bound.mpi

                units_to_schedule.append (unit)

            if  units_to_schedule :
                self._worker.schedule_compute_units (pilot_uid=pid,
                                                     units=units_to_schedule)


        # report any change in wait_queue_size
        old_wait_queue_size  = self.wait_queue_size
        self.wait_queue_size = len(unscheduled)

        if  old_wait_queue_size != self.wait_queue_size :
            self._worker.fire_manager_callback (WAIT_QUEUE_SIZE, self,
                                                self.wait_queue_size)

        if  unscheduled :
            self._worker.unschedule_compute_units (units=unscheduled)

        logger.info ('%s units remain unscheduled' % len(unscheduled))
Example #6
0
    def _unit_state_callback (self, unit, state) :
        """React to a state change of a compute unit.

        * NEW, UNSCHEDULED: if the unit is found in the run queue of a pilot,
          it is moved back into the wait queue and rescheduled.
        * PENDING_OUTPUT_STAGING, STAGING_OUTPUT, DONE, FAILED, CANCELED: the
          unit is removed from the run queue of its pilot, its cores are
          returned to the pilot's 'caps', and the pilot is rescheduled.

        All work is done while holding ``self.lock``.  Exceptions are logged
        and not propagated to the caller.

        :param unit:  the unit whose state changed; ``unit.uid`` and
                      ``unit.execution_details`` are read.
        :param state: the new unit state.
        """

        try :

            with self.lock :

                uid = unit.uid

                logger.info ("[SchedulerCallback]: Computeunit %s changed to %s" % (uid, state))

                if  state in [NEW, UNSCHEDULED] :

                    for pid in self.runqs :

                        if  not pid :
                            logger.warning ('cannot handle final unit %s w/o pilot information' % uid)

                        if  uid in self.runqs[pid] :

                            logger.info ('reschedule NEW unit %s from %s' % (uid, pid))

                            # move the unit from the run queue back into the
                            # wait queue, and let the scheduler place it anew
                            unit = self.runqs[pid][uid]

                            del self.runqs[pid][uid]
                            self.waitq[uid] = unit

                            self._reschedule (uid=uid)

                            return

                # NOTE: as we cannot unregister callbacks, invocations for
                # units which are not (or not anymore) known are not treated
                # as errors.
                #
                # FIXME: how can I *un*register a unit callback?

                if  state in [PENDING_OUTPUT_STAGING, STAGING_OUTPUT, DONE, FAILED, CANCELED] :
                    # the pilot which owned this CU should now have free slots available
                    # FIXME: how do I get the pilot from the CU?

                    pid = unit.execution_details.get ('pilot', None)

                    if  not pid :
                        raise RuntimeError ('cannot handle final unit %s w/o pilot information' % uid)

                    if  pid not in self.pilots :
                        # the pilot is gone, so there are no cores to give
                        # back.  We must not touch self.pilots[pid] below.
                        logger.warning ('cannot handle unit %s cb for pilot %s (pilot is gone)' % (uid, pid))
                        return

                    if  uid in self.runqs[pid] :

                        unit = self.runqs[pid][uid]

                        del self.runqs[pid][uid]
                        self.pilots[pid]['caps'] += unit.description.cores
                        self._reschedule (target_pid=pid)

                    else :
                        logger.warn ('unit %s freed %s cores on %s (== %s) -- not reused'
                                  % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))


        except Exception as e :
            logger.error ("error in unit callback for backfiller (%s) - ignored" % e)
    def handle_schedule(self, schedule):
        """Dispatch a unit schedule to the pilots it refers to.

        Units are grouped by the pilot they are assigned to, so that they can
        be submitted to each pilot in bulk.  Units which have no pilot assigned
        are handed back to the worker as unscheduled, and a change of the
        number of unscheduled units is reported via a WAIT_QUEUE_SIZE manager
        callback.

        :param schedule: mapping with the entries 'units' (unit -> pilot id,
                         or None for units which could not be placed) and
                         'pilots' (pilot id -> mapping with at least 'sandbox'
                         and 'resource').  An empty schedule is ignored.
        :raises RuntimeError: if a unit is assigned to a pilot which is not
                         listed by ``self.list_pilots()``.
        """

        # we want to use bulk submission to the pilots, so we collect all units
        # assigned to the same set of pilots.  At the same time, we select
        # unscheduled units for later insertion into the wait queue.

        if not schedule:
            logger.debug('skipping empty unit schedule')
            return

        pilot_cu_map = dict()
        unscheduled = list()

        pilot_ids = self.list_pilots()

        for unit, pid in schedule['units'].items():

            if pid is None:
                unscheduled.append(unit)
                continue

            if pid not in pilot_ids:
                raise RuntimeError("schedule points to unknown pilot %s" %
                                   pid)

            pilot_cu_map.setdefault(pid, list()).append(unit)

        # submit to all pilots which got something submitted to
        for pid, units in pilot_cu_map.items():

            if pid not in schedule['pilots']:
                # lost pilot, do not schedule its units
                # NOTE(review): these units are neither scheduled nor added to
                # 'unscheduled' -- confirm that this is intended.
                for unit in units:
                    logger.warn("unschedule unit %s, lost pilot %s" %
                                (unit.uid, pid))
                continue

            pilot_info = schedule['pilots'][pid]
            units_to_schedule = list()

            # if a kernel name is in the cu descriptions set, do kernel expansion
            for unit in units:

                unit.sandbox = pilot_info['sandbox'] + "/" + str(unit.uid)

                ud = unit.description

                if 'kernel' in ud and ud['kernel']:

                    try:
                        from radical.ensemblemd.mdkernels import MDTaskDescription
                    except Exception:
                        logger.error("Kernels are not supported in "
                                     "compute unit descriptions -- install "
                                     "radical.ensemblemd.mdkernels!")
                        # FIXME: unit needs a '_set_state() method or something!
                        self._session._dbs.set_compute_unit_state(
                            unit._uid, FAILED, ["kernel expansion failed"])
                        continue

                    # replace the kernel name by the settings the kernel
                    # binds to for the pilot's resource
                    mdtd = MDTaskDescription()
                    mdtd.kernel = ud.kernel
                    mdtd_bound = mdtd.bind(resource=pilot_info['resource'])
                    ud.environment = mdtd_bound.environment
                    ud.pre_exec = mdtd_bound.pre_exec
                    ud.executable = mdtd_bound.executable
                    ud.mpi = mdtd_bound.mpi

                units_to_schedule.append(unit)

            if units_to_schedule:
                self._worker.schedule_compute_units(pilot_uid=pid,
                                                    units=units_to_schedule)

        # report any change in wait_queue_size
        old_wait_queue_size = self.wait_queue_size
        self.wait_queue_size = len(unscheduled)

        if old_wait_queue_size != self.wait_queue_size:
            self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                               self.wait_queue_size)

        if unscheduled:
            self._worker.unschedule_compute_units(units=unscheduled)

        logger.info('%s units remain unscheduled' % len(unscheduled))
    def schedule_compute_units(self, pilot_uid, units):
        """Request the scheduling of one or more ComputeUnits on a
           ComputePilot.

        For each unit, the input and output staging directives of its
        description are split between the file transfer worker (``FTW_*``
        attributes) and the agent (``Agent_*`` attributes):

          * LINK, COPY, MOVE directives are handled by the agent.
          * TRANSFER directives are handled by the agent if the remote side
            (source for input, target for output) has a URL scheme other
            than 'file', and by the file transfer worker otherwise.
          * directives with any other action are dropped with a warning.

        Units with input staging directives are set to PENDING_INPUT_STAGING,
        all other units are set to PENDING_EXECUTION after they have been
        assigned to the pilot.

        :param pilot_uid: id of the pilot the units are assigned to.
        :param units:     the units to schedule.
        :raises Exception: any error is logged and re-raised.
        """

        try:
            cu_transfer   = list()
            cu_notransfer = list()

            # Get some information about the pilot sandbox from the database.
            pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
            # TODO: this hack below relies on what?! That there is just one pilot?
            pilot_sandbox = pilot_info[0]['sandbox']

            # Split units into two different lists: the first list contains the CUs
            # that need file transfer and the second list contains the CUs that
            # don't. The latter is added to the pilot directly, while the former
            # is added to the transfer queue.
            for unit in units:

                # Create object for staging status tracking
                unit.FTW_Input_Status = None
                unit.FTW_Input_Directives = []
                unit.Agent_Input_Status = None
                unit.Agent_Input_Directives = []
                unit.FTW_Output_Status = None
                unit.FTW_Output_Directives = []
                unit.Agent_Output_Status = None
                unit.Agent_Output_Directives = []

                # Split the input staging directives over the transfer worker and the agent
                input_sds = unit.description.input_staging
                if not isinstance(input_sds, list):
                    # Ugly, but is a workaround for iterating on attribute interface
                    # TODO: Verify if this piece of code is actually still required
                    input_sds = [input_sds] if input_sds else []

                for input_sd_entry in input_sds:
                    action = input_sd_entry['action']
                    source = Url(input_sd_entry['source'])
                    target = Url(input_sd_entry['target'])

                    new_sd = {'action':   action,
                              'source':   str(source),
                              'target':   str(target),
                              'flags':    input_sd_entry['flags'],
                              'priority': input_sd_entry['priority'],
                              'state':    PENDING
                    }

                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    elif action in [TRANSFER]:
                        if source.scheme and source.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote pull from the agent
                            unit.Agent_Input_Directives.append(new_sd)
                            unit.Agent_Input_Status = PENDING
                        else:
                            # Transfer from local to sandbox
                            unit.FTW_Input_Directives.append(new_sd)
                            unit.FTW_Input_Status = PENDING
                    else:
                        logger.warn('Not sure if action %s makes sense for input staging' % action)

                # Split the output staging directives over the transfer worker and the agent
                output_sds = unit.description.output_staging
                if not isinstance(output_sds, list):
                    # Ugly, but is a workaround for iterating on att iface
                    # TODO: Verify if this piece of code is actually still required
                    output_sds = [output_sds] if output_sds else []

                for output_sds_entry in output_sds:
                    action = output_sds_entry['action']
                    source = Url(output_sds_entry['source'])
                    target = Url(output_sds_entry['target'])

                    new_sd = {'action':   action,
                              'source':   str(source),
                              'target':   str(target),
                              'flags':    output_sds_entry['flags'],
                              'priority': output_sds_entry['priority'],
                              'state':    PENDING
                    }

                    # note: the output status starts out as NEW, while the
                    # input status above starts out as PENDING
                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    elif action in [TRANSFER]:
                        if target.scheme and target.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote push from the agent
                            unit.Agent_Output_Directives.append(new_sd)
                            unit.Agent_Output_Status = NEW
                        else:
                            # Transfer from sandbox back to local
                            unit.FTW_Output_Directives.append(new_sd)
                            unit.FTW_Output_Status = NEW
                    else:
                        logger.warn('Not sure if action %s makes sense for output staging' % action)

                if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                    log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                    self._db.set_compute_unit_state(unit.uid, PENDING_INPUT_STAGING, log)
                    cu_transfer.append(unit)
                else:
                    cu_notransfer.append(unit)

            # Bulk-add all non-transfer units-
            self._db.assign_compute_units_to_pilot(
                units=cu_notransfer,
                pilot_uid=pilot_uid,
                pilot_sandbox=pilot_sandbox
            )

            self._db.assign_compute_units_to_pilot(
                units=cu_transfer,
                pilot_uid=pilot_uid,
                pilot_sandbox=pilot_sandbox
            )

            for unit in cu_notransfer:
                log = "Scheduled for execution on ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)

            logger.info(
                "Scheduled ComputeUnits %s for execution on ComputePilot '%s'." %
                (cu_notransfer, pilot_uid)
            )
        except Exception:
            # logger.exception records the traceback; the error itself is
            # passed on to the caller unchanged.
            logger.exception ('error in unit manager controller (schedule())')
            raise
Example #9
0
    def _pilot_state_callback (self, pilot, state) :
        """React to a state change of a compute pilot.

        State changes of pilots which are not in ``self.pilots`` are ignored.
        Otherwise the new state is recorded, and:

          * ACTIVE: the pilot is rescheduled, as it can now be used.
          * DONE, FAILED, CANCELED: the units of the pilot are updated in the
            database -- units which did not start executing yet, and
            restartable units which did, are set back to UNSCHEDULED;
            non-restartable units which did start executing are set to
            FAILED.  The pilot is then removed from ``self.pilots``.

        All work is done while holding ``self.lock``.

        :param pilot: the pilot whose state changed; ``pilot.uid`` is read.
        :param state: the new pilot state.
        :raises Exception: any error is logged and re-raised.
        """

        try :

            with self.lock :

                pid = pilot.uid

                if  pid not in self.pilots :
                    # as we cannot unregister callbacks, we simply ignore this
                    # invocation.  It's probably from a pilot we used previously.
                    logger.warn ("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)" % (pid, state))
                    return


                self.pilots[pid]['state'] = state
                logger.debug ("[SchedulerCallback]: ComputePilot %s changed to %s" % (pid, state))

                if  state in [ACTIVE] :
                    # the pilot is now ready to be used
                    self._reschedule (target_pid=pid)

                if  state in [DONE, FAILED, CANCELED] :

                    # If the pilot state is 'DONE', 'FAILED' or 'CANCELED', we
                    # need to reschedule the units which are reschedulable --
                    # all others are marked 'FAILED' if they are already
                    # 'EXECUTING' and not restartable
                    timestamp = datetime.datetime.utcnow()

                    # units which did not start executing yet: reschedule
                    self._db.change_compute_units (
                        filter_dict = {"pilot"       : pid,
                                       "state"       : {"$in": [UNSCHEDULED,
                                                                PENDING_INPUT_STAGING,
                                                                STAGING_INPUT,
                                                                PENDING_EXECUTION,
                                                                SCHEDULING]}},
                        set_dict    = {"state"       : UNSCHEDULED,
                                       "pilot"       : None},
                        push_dict   = {"statehistory": {"state"     : UNSCHEDULED,
                                                        "timestamp" : timestamp},
                                       "log"         : {"message"   :  "reschedule unit",
                                                        "timestamp" : timestamp}
                                      })

                    # restartable units which did start executing: reschedule
                    self._db.change_compute_units (
                        filter_dict = {"pilot"       : pid,
                                       "restartable" : True,
                                       "state"       : {"$in": [EXECUTING,
                                                                PENDING_OUTPUT_STAGING,
                                                                STAGING_OUTPUT]}},
                        set_dict    = {"state"       : UNSCHEDULED,
                                       "pilot"       : None},
                        push_dict   = {"statehistory": {"state"     : UNSCHEDULED,
                                                        "timestamp" : timestamp},
                                       "log"         : {"message"   :  "reschedule unit",
                                                        "timestamp" : timestamp}
                                      })

                    # non-restartable units which did start executing: these
                    # are failed, not rescheduled, and the log message in the
                    # database has to say so.
                    self._db.change_compute_units (
                        filter_dict = {"pilot"       : pid,
                                       "restartable" : False,
                                       "state"       : {"$in": [EXECUTING,
                                                                PENDING_OUTPUT_STAGING,
                                                                STAGING_OUTPUT]}},
                        set_dict    = {"state"       : FAILED},
                        push_dict   = {"statehistory": {"state"     : FAILED,
                                                        "timestamp" : timestamp},
                                       "log"         : {"message"   :  "unit failed: pilot is final and unit is not restartable",
                                                        "timestamp" : timestamp}
                                      })

                    # make sure that restartable units got back into the
                    # wait queue
                    #
                    # FIXME AM: state management: I don't have the unit
                    # state!  New state was just pushed to the DB, but I have
                    # actually no idea for which units, and the state known to
                    # the worker (i.e. the cached state) is most likely
                    # outdated.
                    #
                    # So we don't handle runq/waitq here.  Instead, we rely on
                    # the unit cb to get invoked as soon as the state
                    # propagated back to us, and then remove them from the
                    # runq.  This is slow, potentially very slow, but safe.

                    # we can't use this pilot anymore...
                    del self.pilots[pid]
                    # FIXME: how can I *un*register a pilot callback?


        except Exception as e :
            # NOTE(review): the message says 'ignored', but the exception is
            # re-raised -- confirm which of the two is intended.
            logger.exception ("error in pilot callback for backfiller (%s) - ignored" % e)
            raise
    def run(self):
        """Starts the process when Process.start() is called.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug(
                    "Connected to MongoDB. Serving requests for PilotManager %s."
                    % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then send to the local or remote
                # queueing system. If this succedes, we set the ComputePilot's
                # state to pending, otherwise to failed.
                compute_pilot = None

                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={
                        "pilotmanager": self.pilot_manager_id,
                        "state": PENDING_LAUNCH
                    },
                    update={
                        "$set": {
                            "state": LAUNCHING
                        },
                        "$push": {
                            "statehistory": {
                                "state": LAUNCHING,
                                "timestamp": ts
                            }
                        }
                    })

                if not compute_pilot:
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot['description']['cores']
                        runtime = compute_pilot['description']['runtime']
                        queue = compute_pilot['description']['queue']
                        project = compute_pilot['description']['project']
                        cleanup = compute_pilot['description']['cleanup']
                        resource_key = compute_pilot['description']['resource']
                        schema = compute_pilot['description']['access_schema']
                        memory = compute_pilot['description']['memory']
                        pilot_sandbox = compute_pilot['sandbox']
                        global_sandbox = compute_pilot['global_sandbox']

                        # we expand and exchange keys in the resource config,
                        # depending on the selected schema so better use a deep
                        # copy..
                        resource_cfg = self._session.get_resource_config(
                            resource_key, schema)

                        # import pprint
                        # pprint.pprint (resource_cfg)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get(
                            'agent_mongodb_endpoint', database_url)
                        agent_spawner = resource_cfg.get(
                            'agent_spawner', DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get('agent_type',
                                                      DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get('agent_scheduler')
                        tunnel_bind_device = resource_cfg.get(
                            'tunnel_bind_device')
                        default_queue = resource_cfg.get('default_queue')
                        forward_tunnel_endpoint = resource_cfg.get(
                            'forward_tunnel_endpoint')
                        js_endpoint = resource_cfg.get('job_manager_endpoint')
                        lrms = resource_cfg.get('lrms')
                        mpi_launch_method = resource_cfg.get(
                            'mpi_launch_method')
                        pre_bootstrap = resource_cfg.get('pre_bootstrap')
                        python_interpreter = resource_cfg.get(
                            'python_interpreter')
                        spmd_variation = resource_cfg.get('spmd_variation')
                        task_launch_method = resource_cfg.get(
                            'task_launch_method')
                        rp_version = resource_cfg.get('rp_version',
                                                      DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get(
                            'virtenv_mode', DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get(
                            'stage_cacerts', 'False')

                        if stage_cacerts.lower() == 'true':
                            stage_cacerts = True
                        else:
                            stage_cacerts = False

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            'pilot_sandbox': saga.Url(pilot_sandbox).path,
                            'global_sandbox': saga.Url(global_sandbox).path
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get('global_virtenv')
                        if global_virtenv:
                            logger.warn(
                                "'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'"
                            )
                            virtenv = global_virtenv
                            virtenv_mode = 'use'

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme: db_url.scheme = 'mongodb'
                        if not db_url.host: db_url.host = 'localhost'
                        if not db_url.port: db_url.port = 27017
                        if not database_name: database_name = 'radicalpilot'

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host,
                                                       db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = 'default_bootstrapper.sh'
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" \
                                % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" %
                                                 bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" %
                                                 pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                                % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url,
                                                         session=self._session)
                        bs_script.copy(bs_script_tgt,
                                       flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # examples   :
                        #   [email protected]
                        #   virtenv@devel
                        #   virtenv@release
                        #   virtenv@installed
                        #   stage@local
                        #   stage@/tmp/my_agent.py
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then the 'virtenv:tag'
                        # will not make sense, as the virtenv is not updated.
                        # In those cases, the virtenv_mode is honored, and
                        # a warning is printed.
                        #
                        # Also, the 'stage' mode can only be combined with the
                        # 'local' source, or with a path to the agent (relative
                        # to mod_dir, or absolute).
                        #
                        # A rp_version which does not adhere to the
                        # above syntax is ignored, and the fallback stage@local
                        # is used.

                        if  not rp_version.startswith('@') and \
                            not rp_version in ['installed', 'local', 'debug']:
                            raise ValueError("invalid rp_version '%s'" %
                                             rp_version)

                        stage_sdist = True
                        if rp_version in ['installed', 'release']:
                            stage_sdist = False

                        if rp_version.startswith('@'):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [
                                    ru.sdist_path, saga.sdist_path, sdist_path
                            ]:

                                sdist_url = saga.Url("file://localhost/%s" %
                                                     path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (
                                    sdist_url, pilot_sandbox)
                                logentries.append(
                                    Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" \
                                    % (mod_dir, 'cacert.pem.gz'))

                            cc_script_url = saga.Url("file://localhost/%s" %
                                                     cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" %
                                                     pilot_sandbox)

                            cc_script = saga.filesystem.File(
                                cc_script_url, session=self._session)
                            cc_script.copy(
                                cc_script_tgt,
                                flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms: raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values
                        debug_level = os.environ.get(
                            'RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                'CRITICAL': 1,
                                'ERROR': 2,
                                'WARNING': 3,
                                'WARN': 3,
                                'INFO': 4,
                                'DEBUG': 5
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = 'luve'  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private
                            if virtenv_mode is not 'private':
                                cleanup = cleanup.replace('v', '')

                        sdists = ':'.join(
                            [ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            bootstrap_args += " -e '%s'" % "' -e '".join(
                                pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data['job_services']:
                            js = self._shared_worker_data['job_services'][
                                js_url]
                        else:
                            js = saga.job.Service(js_url,
                                                  session=self._session)
                            self._shared_worker_data['job_services'][
                                js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = [
                            "-l pilot_bootstrapper.sh", bootstrap_args
                        ]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if 'RADICAL_PILOT_PROFILE' in os.environ:
                            jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                        logger.debug("Bootstrap command line: %s %s" %
                                     (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(
                            jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data['job_ids'][pilot_id] = [
                            saga_job_id, js_url
                        ]

                        msg = "SAGA job submitted with job id %s" % str(
                            saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA job submission was successful.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": 'Launching'
                            }, {
                                "$set": {
                                    "state": PENDING_ACTIVE,
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                        if ret['n'] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update({"_id": pilot_id}, {
                                "$set": {
                                    "saga_job_id": saga_job_id
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": PENDING_ACTIVE,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(
                            pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        pilot_col.update(
                            {
                                "_id": pilot_id,
                                "state": {
                                    "$ne": FAILED
                                }
                            }, {
                                "$set": {
                                    "state": FAILED,
                                    "stdout": out,
                                    "stderr": err,
                                    "logfile": log
                                },
                                "$push": {
                                    "statehistory": {
                                        "state": FAILED,
                                        "timestamp": ts
                                    }
                                },
                                "$pushAll": {
                                    "log": log_dicts
                                }
                            })
                        logger.exception('\n'.join(log_messages))

        except SystemExit as e:
            logger.exception(
                "pilot launcher thread caught system exit -- forcing application shutdown"
            )
            import thread
            thread.interrupt_main()
    def check_pilot_states(self, pilot_col):
        """Run one health-check pass over this manager's live pilots.

        Looks up all pilot documents in `pilot_col` which belong to
        `self.pilot_manager_id` and are in state PENDING_ACTIVE or ACTIVE,
        reconnects to the SAGA job behind each of them, and moves the pilot
        to a final state in the database if the job has ended:

          * SAGA job FAILED or CANCELED      -> pilot state FAILED
          * SAGA job DONE                    -> pilot state DONE
          * job state cannot be determined
            after a successful reconnect     -> pilot state FAILED
          * reconnect failed at least
            JOB_CHECK_MAX_MISSES times       -> pilot state FAILED

        Failed reconnects are counted per pilot in `self.missing_pilots`.
        Job service handles are cached in
        `self._shared_worker_data['job_services']`, keyed by service URL.

        pilot_col: the pilot collection; this method calls `find()` and
                   `update()` on it and passes it to `_get_pilot_logs()`.
        """

        pending_pilots = pilot_col.find({
            "pilotmanager": self.pilot_manager_id,
            "state": {
                "$in": [PENDING_ACTIVE, ACTIVE]
            }
        })

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info(
                "Performing periodical health check for %s (SAGA job id %s)" %
                (str(pilot_id), saga_job_id))

            if pilot_id not in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            try:
                # the service URL is taken from the part of the job id which
                # precedes "]-[", minus its first character
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data['job_services']:
                    js = self._shared_worker_data['job_services'][js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data['job_services'][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                # read the state only once, so that both checks below act on
                # the same value
                job_state = saga_job.state

                if job_state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, job_state)

                if job_state in [saga.job.DONE]:
                    pilot_done = True
                    log_message  = "SAGA job state for ComputePilot %s is %s."\
                                 % (pilot_id, job_state)

            except Exception as e:

                if not reconnected:
                    # we never obtained a job handle: count the miss, and
                    # only give up once the miss limit is reached
                    logger.warning(
                        'could not reconnect to pilot for state check (%s)' %
                        e)
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug('giving up after %d attempts' %
                                     self.missing_pilots[pilot_id])
                        pilot_failed = True
                        log_message  = "Could not reconnect to pilot %s "\
                                       "multiple times - giving up" % pilot_id
                else:
                    # we have a job handle, but querying its state failed
                    logger.warning('pilot state check failed: %s' % e)
                    pilot_failed = True
                    log_message  = "Couldn't determine job state for ComputePilot %s. " \
                                   "Assuming it has failed." % pilot_id

            # failure takes precedence over completion
            if pilot_failed:
                final_state = FAILED
            elif pilot_done:
                final_state = DONE
            else:
                final_state = None

            if final_state is None:
                if self.missing_pilots[pilot_id]:
                    logger.info ('pilot %s *assumed* alive and well (%s)' \
                              % (pilot_id, self.missing_pilots[pilot_id]))
                else:
                    logger.info ('pilot %s seems alive and well' \
                              % (pilot_id))
                continue

            # FIXME: the logs are fetched even if the update below does not
            # match, i.e. if the pilot is already marked as DONE.
            out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
            ts = datetime.datetime.utcnow()

            # the filter leaves pilots alone which are already marked DONE
            pilot_col.update({
                "_id": pilot_id,
                "state": {
                    "$ne": DONE
                }
            }, {
                "$set": {
                    "state": final_state,
                    "stdout": out,
                    "stderr": err,
                    "logfile": log
                },
                "$push": {
                    "statehistory": {
                        "state": final_state,
                        "timestamp": ts
                    },
                    "log": {
                        "message": log_message,
                        "timestamp": ts
                    }
                }
            })
            logger.debug(log_message)

            if pilot_failed:
                logger.warn('pilot %s declared dead' % pilot_id)
            else:
                # a pilot whose job completed is not 'dead' -- report it
                # as done, and not at warning level
                logger.info('pilot %s is done' % pilot_id)
Example #12
0
    def schedule_compute_units(self, pilot_uid, units):
        """Request the scheduling of one or more ComputeUnits on a
           ComputePilot.
        """

        try:
            cu_transfer = list()
            cu_notransfer = list()

            # Get some information about the pilot sandbox from the database.
            pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
            # TODO: this hack below relies on what?! That there is just one pilot?
            pilot_sandbox = pilot_info[0]['sandbox']

            # Split units into two different lists: the first list contains the CUs
            # that need file transfer and the second list contains the CUs that
            # don't. The latter is added to the pilot directly, while the former
            # is added to the transfer queue.
            for unit in units:

                # Create object for staging status tracking
                unit.FTW_Input_Status = None
                unit.FTW_Input_Directives = []
                unit.Agent_Input_Status = None
                unit.Agent_Input_Directives = []
                unit.FTW_Output_Status = None
                unit.FTW_Output_Directives = []
                unit.Agent_Output_Status = None
                unit.Agent_Output_Directives = []

                # Split the input staging directives over the transfer worker and the agent
                input_sds = unit.description.input_staging
                if not isinstance(input_sds, list):
                    # Ugly, but is a workaround for iterating on attribute interface
                    # TODO: Verify if this piece of code is actually still required
                    if input_sds:
                        input_sds = [input_sds]
                    else:
                        input_sds = []

                for input_sd_entry in input_sds:
                    action = input_sd_entry['action']
                    source = Url(input_sd_entry['source'])
                    target = Url(input_sd_entry['target'])

                    new_sd = {
                        'action': action,
                        'source': str(source),
                        'target': str(target),
                        'flags': input_sd_entry['flags'],
                        'priority': input_sd_entry['priority'],
                        'state': PENDING
                    }

                    if action in [LINK, COPY, MOVE]:
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    elif action in [TRANSFER]:
                        if source.scheme and source.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote pull from the agent
                            unit.Agent_Input_Directives.append(new_sd)
                            unit.Agent_Input_Status = PENDING
                        else:
                            # Transfer from local to sandbox
                            unit.FTW_Input_Directives.append(new_sd)
                            unit.FTW_Input_Status = PENDING
                    else:
                        logger.warn(
                            'Not sure if action %s makes sense for input staging'
                            % action)

                # Split the output staging directives over the transfer worker and the agent
                output_sds = unit.description.output_staging
                if not isinstance(output_sds, list):
                    # Ugly, but is a workaround for iterating on att iface
                    # TODO: Verify if this piece of code is actually still required
                    if output_sds:
                        output_sds = [output_sds]
                    else:
                        output_sds = []

                for output_sds_entry in output_sds:
                    action = output_sds_entry['action']
                    source = Url(output_sds_entry['source'])
                    target = Url(output_sds_entry['target'])

                    new_sd = {
                        'action': action,
                        'source': str(source),
                        'target': str(target),
                        'flags': output_sds_entry['flags'],
                        'priority': output_sds_entry['priority'],
                        'state': PENDING
                    }

                    if action == LINK or action == COPY or action == MOVE:
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    elif action == TRANSFER:
                        if target.scheme and target.scheme != 'file':
                            # If there is a scheme and it is different than "file",
                            # assume a remote push from the agent
                            unit.Agent_Output_Directives.append(new_sd)
                            unit.Agent_Output_Status = NEW
                        else:
                            # Transfer from sandbox back to local
                            unit.FTW_Output_Directives.append(new_sd)
                            unit.FTW_Output_Status = NEW
                    else:
                        logger.warn(
                            'Not sure if action %s makes sense for output staging'
                            % action)

                if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                    log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                    self._db.set_compute_unit_state(unit.uid,
                                                    PENDING_INPUT_STAGING, log)
                    cu_transfer.append(unit)
                else:
                    cu_notransfer.append(unit)

            # Bulk-add all non-transfer units-
            self._db.assign_compute_units_to_pilot(units=cu_notransfer,
                                                   pilot_uid=pilot_uid,
                                                   pilot_sandbox=pilot_sandbox)

            self._db.assign_compute_units_to_pilot(units=cu_transfer,
                                                   pilot_uid=pilot_uid,
                                                   pilot_sandbox=pilot_sandbox)

            for unit in cu_notransfer:
                log = "Scheduled for execution on ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION,
                                                log)
                #self._set_state(uid, PENDING_EXECUTION, log)

            logger.info(
                "Scheduled ComputeUnits %s for execution on ComputePilot '%s'."
                % (cu_notransfer, pilot_uid))
        except Exception, e:
            logger.exception('error in unit manager controller (schedule())')
            raise
    def run(self):
        """Starts the process when Process.start() is called.

        Connects to MongoDB and then loops until ``self._stop`` is set.  In
        every iteration the loop

          * calls ``check_pilot_states()`` once more than
            ``JOB_CHECK_INTERVAL`` seconds have passed since the last check;
          * atomically claims one pilot document of this pilot manager that is
            in state ``PENDING_LAUNCH`` (moving it to ``LAUNCHING``), copies
            the bootstrapper (plus sdists / CA certs when required) into the
            pilot sandbox and submits the pilot agent as a SAGA job.

        After a successful submission the pilot document is moved to
        ``PENDING_ACTIVE``.  Any exception raised while launching marks the
        pilot ``FAILED`` and stores the collected log entries with it.

        The method returns (after logging) if the database connection cannot
        be established.  A ``SystemExit`` raised inside the loop is logged and
        turned into an interrupt of the main thread.
        """

        # make sure to catch sys.exit (which raises SystemExit)
        try:
            # Get directory where this module lives
            mod_dir = os.path.dirname(os.path.realpath(__file__))

            # Try to connect to the database
            try:
                connection = self.db_connection_info.get_db_handle()
                db = connection[self.db_connection_info.dbname]
                pilot_col = db["%s.p" % self.db_connection_info.session_id]
                logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

            except Exception as e:
                logger.exception("Connection error: %s" % e)
                return

            last_job_check = time.time()

            while not self._stop.is_set():

                # Periodically, we pull up all ComputePilots that are pending
                # execution or were last seen executing and check if the corresponding
                # SAGA job is still pending in the queue. If that is not the case,
                # we assume that the job has failed for some reasons and update
                # the state of the ComputePilot accordingly.
                if last_job_check + JOB_CHECK_INTERVAL < time.time():
                    last_job_check = time.time()
                    self.check_pilot_states(pilot_col)

                # See if we can find a ComputePilot that is waiting to be launched.
                # If we find one, we use SAGA to create a job service, a job
                # description and a job that is then sent to the local or remote
                # queueing system. If this succeeds, we set the ComputePilot's
                # state to pending, otherwise to failed.
                ts = datetime.datetime.utcnow()
                compute_pilot = pilot_col.find_and_modify(
                    query={"pilotmanager": self.pilot_manager_id, "state": PENDING_LAUNCH},
                    update={
                        "$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING, "timestamp": ts}},
                    },
                )

                if not compute_pilot:
                    # nothing to launch right now -- back off a little
                    time.sleep(IDLE_TIMER)

                else:
                    try:
                        # ------------------------------------------------------
                        #
                        # LAUNCH THE PILOT AGENT VIA SAGA
                        #
                        # log entries are collected here and written to the
                        # pilot document in one go (on success and on failure)
                        logentries = []
                        pilot_id = str(compute_pilot["_id"])

                        logger.info("Launching ComputePilot %s" % pilot_id)

                        # ------------------------------------------------------
                        # Database connection parameters
                        session_uid = self.db_connection_info.session_id
                        database_url = self.db_connection_info.dburl
                        database_name = self.db_connection_info.dbname
                        database_auth = self.db_connection_info.dbauth

                        # ------------------------------------------------------
                        # pilot description and resource configuration
                        number_cores = compute_pilot["description"]["cores"]
                        runtime = compute_pilot["description"]["runtime"]
                        queue = compute_pilot["description"]["queue"]
                        project = compute_pilot["description"]["project"]
                        cleanup = compute_pilot["description"]["cleanup"]
                        resource_key = compute_pilot["description"]["resource"]
                        schema = compute_pilot["description"]["access_schema"]
                        memory = compute_pilot["description"]["memory"]
                        pilot_sandbox = compute_pilot["sandbox"]
                        global_sandbox = compute_pilot["global_sandbox"]

                        # The resource config is looked up per resource key and
                        # access schema.
                        # NOTE(review): an earlier comment here said a deep
                        # copy is needed because keys get expanded/exchanged
                        # -- presumably get_resource_config() returns a copy;
                        # confirm.
                        resource_cfg = self._session.get_resource_config(resource_key, schema)

                        # ------------------------------------------------------
                        # get parameters from cfg, set defaults where needed
                        agent_mongodb_endpoint = resource_cfg.get("agent_mongodb_endpoint", database_url)
                        agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                        agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                        agent_scheduler = resource_cfg.get("agent_scheduler")
                        tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                        default_queue = resource_cfg.get("default_queue")
                        forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                        js_endpoint = resource_cfg.get("job_manager_endpoint")
                        lrms = resource_cfg.get("lrms")
                        mpi_launch_method = resource_cfg.get("mpi_launch_method")
                        pre_bootstrap = resource_cfg.get("pre_bootstrap")
                        python_interpreter = resource_cfg.get("python_interpreter")
                        spmd_variation = resource_cfg.get("spmd_variation")
                        task_launch_method = resource_cfg.get("task_launch_method")
                        rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                        virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                        virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                        stage_cacerts = resource_cfg.get("stage_cacerts", "False")

                        # The flag may be given as a string ("True"/"false") or
                        # as a real boolean; str() makes both forms work
                        # instead of failing on bool.lower().
                        stage_cacerts = str(stage_cacerts).lower() == "true"

                        # expand variables in virtenv string
                        virtenv = virtenv % {
                            "pilot_sandbox": saga.Url(pilot_sandbox).path,
                            "global_sandbox": saga.Url(global_sandbox).path,
                        }

                        # Check for deprecated global_virtenv
                        global_virtenv = resource_cfg.get("global_virtenv")
                        if global_virtenv:
                            logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                            virtenv = global_virtenv
                            virtenv_mode = "use"

                        # set default scheme, host, port and dbname if not set
                        db_url = saga.Url(agent_mongodb_endpoint)
                        if not db_url.scheme:
                            db_url.scheme = "mongodb"
                        if not db_url.host:
                            db_url.host = "localhost"
                        if not db_url.port:
                            db_url.port = 27017
                        if not database_name:
                            database_name = "radicalpilot"

                        # Create a host:port string for use by the bootstrapper.
                        database_hostport = "%s:%d" % (db_url.host, db_url.port)

                        # ------------------------------------------------------
                        # Copy the bootstrap shell script.  This also creates
                        # the sandbox. We use always "default_bootstrapper.sh"
                        bootstrapper = "default_bootstrapper.sh"
                        bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, bootstrapper))

                        msg = "Using bootstrapper %s" % bootstrapper_path
                        logentries.append(Logentry(msg, logger=logger.info))

                        bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                        bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                        msg = "Copying bootstrapper '%s' to agent sandbox (%s)." % (bs_script_url, bs_script_tgt)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                        bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        bs_script.close()

                        # ------------------------------------------------------
                        # the version of the agent is derived from
                        # rp_version, which has the following format
                        # and interpretation:
                        #
                        # case rp_version:
                        #   @<token>:
                        #   @tag/@branch/@commit: # no sdist staging
                        #       git clone $github_base radical.pilot.src
                        #       (cd radical.pilot.src && git checkout token)
                        #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                        #       rm -rf radical.pilot.src
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   release: # no sdist staging
                        #       pip install -t $VIRTENV/rp_install radical.pilot
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   local: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $VIRTENV/rp_install $sdist/
                        #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                        #
                        #   debug: # needs sdist staging
                        #       tar zxf $sdist.tgz
                        #       pip install -t $SANDBOX/rp_install $sdist/
                        #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                        #
                        #   installed: # no sdist staging
                        #       true
                        # esac
                        #
                        # virtenv_mode
                        #   private : error  if ve exists, otherwise create, then use
                        #   update  : update if ve exists, otherwise create, then use
                        #   create  : use    if ve exists, otherwise create, then use
                        #   use     : use    if ve exists, otherwise error,  then exit
                        #   recreate: delete if ve exists, otherwise create, then use
                        #
                        # Note that some combinations may be invalid,
                        # specifically in the context of virtenv_mode.  If, for
                        # example, virtenv_mode is 'use', then an '@tag'
                        # rp_version will not make sense, as the virtenv is not
                        # updated.
                        #
                        # An rp_version which is neither '@<token>' nor one of
                        # the names accepted below is rejected with a
                        # ValueError (which fails this pilot launch).
                        #
                        # NOTE(review): 'release' is documented above and
                        # handled a few lines down, but is not in the accepted
                        # list, so it currently raises -- confirm whether that
                        # is intended before changing it.
                        if not rp_version.startswith("@") and rp_version not in ["installed", "local", "debug"]:
                            raise ValueError("invalid rp_version '%s'" % rp_version)

                        stage_sdist = True
                        if rp_version in ["installed", "release"]:
                            stage_sdist = False

                        if rp_version.startswith("@"):
                            stage_sdist = False
                            rp_version = rp_version[1:]  # strip '@'

                        # ------------------------------------------------------
                        # Copy the rp sdist if needed.  We actually also stage
                        # the sdists for radical.utils and radical.saga, so that
                        # we have the complete stack to install...
                        if stage_sdist:

                            for path in [ru.sdist_path, saga.sdist_path, sdist_path]:

                                sdist_url = saga.Url("file://localhost/%s" % path)
                                msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                                logentries.append(Logentry(msg, logger=logger.debug))

                                sdist_file = saga.filesystem.File(sdist_url)
                                sdist_file.copy("%s/" % (str(pilot_sandbox)))
                                sdist_file.close()

                        # ------------------------------------------------------
                        # some machines cannot run pip due to outdated ca certs.
                        # For those, we also stage an updated cert bundle
                        if stage_cacerts:
                            cc_path = os.path.abspath("%s/../bootstrapper/%s" % (mod_dir, "cacert.pem.gz"))

                            cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                            cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                            cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                            cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                            cc_script.close()

                        # ------------------------------------------------------
                        # sanity checks
                        if not agent_spawner:
                            raise RuntimeError("missing agent spawner")
                        if not agent_scheduler:
                            raise RuntimeError("missing agent scheduler")
                        if not lrms:
                            raise RuntimeError("missing LRMS")
                        if not mpi_launch_method:
                            raise RuntimeError("missing mpi launch method")
                        if not task_launch_method:
                            raise RuntimeError("missing task launch method")

                        # massage some values: the debug level may be given
                        # numerically or as a level name; unknown names map to 0
                        debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                        try:
                            debug_level = int(debug_level)
                        except ValueError:
                            debug_level = {
                                "CRITICAL": 1,
                                "ERROR": 2,
                                "WARNING": 3,
                                "WARN": 3,
                                "INFO": 4,
                                "DEBUG": 5,
                            }.get(debug_level, 0)

                        if not queue:
                            queue = default_queue

                        if cleanup and isinstance(cleanup, bool):
                            cleanup = "luve"  #  l : log files
                            #  u : unit work dirs
                            #  v : virtualenv
                            #  e : everything (== pilot sandbox)
                            #
                            # we never cleanup virtenvs which are not private.
                            # Compare by value: an identity test ('is not')
                            # against a string literal is unreliable for
                            # values read from a config.
                            if virtenv_mode != "private":
                                cleanup = cleanup.replace("v", "")

                        sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                        # set mandatory args
                        bootstrap_args = ""
                        bootstrap_args += " -b '%s'" % sdists
                        bootstrap_args += " -c '%s'" % number_cores
                        bootstrap_args += " -d '%s'" % debug_level
                        bootstrap_args += " -g '%s'" % virtenv
                        bootstrap_args += " -j '%s'" % task_launch_method
                        bootstrap_args += " -k '%s'" % mpi_launch_method
                        bootstrap_args += " -l '%s'" % lrms
                        bootstrap_args += " -m '%s'" % database_hostport
                        bootstrap_args += " -n '%s'" % database_name
                        bootstrap_args += " -o '%s'" % agent_spawner
                        bootstrap_args += " -p '%s'" % pilot_id
                        bootstrap_args += " -q '%s'" % agent_scheduler
                        bootstrap_args += " -r '%s'" % runtime
                        bootstrap_args += " -s '%s'" % session_uid
                        bootstrap_args += " -t '%s'" % agent_type
                        bootstrap_args += " -u '%s'" % virtenv_mode
                        bootstrap_args += " -v '%s'" % rp_version

                        # set optional args
                        if database_auth:
                            bootstrap_args += " -a '%s'" % database_auth
                        if tunnel_bind_device:
                            bootstrap_args += " -D '%s'" % tunnel_bind_device
                        if pre_bootstrap:
                            # one '-e' option per pre_bootstrap entry
                            bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                        if forward_tunnel_endpoint:
                            bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                        if python_interpreter:
                            bootstrap_args += " -i '%s'" % python_interpreter
                        if cleanup:
                            bootstrap_args += " -x '%s'" % cleanup

                        # ------------------------------------------------------
                        # now that the script is in place and we know where it is,
                        # we can launch the agent.  Job service handles are
                        # cached in the shared worker data and reused per URL.
                        js_url = saga.Url(js_endpoint)
                        logger.debug("saga.job.Service ('%s')" % js_url)
                        if js_url in self._shared_worker_data["job_services"]:
                            js = self._shared_worker_data["job_services"][js_url]
                        else:
                            js = saga.job.Service(js_url, session=self._session)
                            self._shared_worker_data["job_services"][js_url] = js

                        # ------------------------------------------------------
                        # Create SAGA Job description and submit the pilot job

                        jd = saga.job.Description()

                        jd.executable = "/bin/bash"
                        jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                        jd.working_directory = saga.Url(pilot_sandbox).path
                        jd.project = project
                        jd.output = "agent.out"
                        jd.error = "agent.err"
                        jd.total_cpu_count = number_cores
                        jd.wall_time_limit = runtime
                        jd.total_physical_memory = memory
                        jd.queue = queue

                        # Set the SPMD variation only if required
                        if spmd_variation:
                            jd.spmd_variation = spmd_variation

                        if "RADICAL_PILOT_PROFILE" in os.environ:
                            jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                        logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                        msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                        logentries.append(Logentry(msg, logger=logger.debug))

                        pilotjob = js.create_job(jd)
                        pilotjob.run()

                        # do a quick error check
                        if pilotjob.state == saga.FAILED:
                            raise RuntimeError("SAGA Job state is FAILED.")

                        saga_job_id = pilotjob.id
                        self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                        msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                        logentries.append(Logentry(msg, logger=logger.debug))

                        #
                        # ------------------------------------------------------

                        log_dicts = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())

                        # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                        # job submission was successful.  The query matches on
                        # the same LAUNCHING constant that was written when the
                        # pilot was claimed above.
                        ts = datetime.datetime.utcnow()
                        ret = pilot_col.update(
                            {"_id": pilot_id, "state": LAUNCHING},
                            {
                                "$set": {"state": PENDING_ACTIVE, "saga_job_id": saga_job_id},
                                "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )

                        if ret["n"] == 0:
                            # could not update, probably because the agent is
                            # running already.  Just update state history and
                            # jobid then
                            # FIXME: make sure of the agent state!
                            ret = pilot_col.update(
                                {"_id": pilot_id},
                                {
                                    "$set": {"saga_job_id": saga_job_id},
                                    "$push": {"statehistory": {"state": PENDING_ACTIVE, "timestamp": ts}},
                                    "$pushAll": {"log": log_dicts},
                                },
                            )

                    except Exception as e:
                        # Update the Pilot's state 'FAILED'.
                        out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                        ts = datetime.datetime.utcnow()

                        # FIXME: we seem to be unable to bson/json handle saga
                        # log messages containing an '#'.  This shows up here.
                        # Until we find a clean workaround, make log shorter and
                        # rely on saga logging to reveal the problem.
                        msg = "Pilot launching failed! (%s)" % e
                        logentries.append(Logentry(msg))

                        log_dicts = list()
                        log_messages = list()
                        for le in logentries:
                            log_dicts.append(le.as_dict())
                            log_messages.append(le.message)

                        # only touch pilots which are not already marked FAILED
                        pilot_col.update(
                            {"_id": pilot_id, "state": {"$ne": FAILED}},
                            {
                                "$set": {"state": FAILED, "stdout": out, "stderr": err, "logfile": log},
                                "$push": {"statehistory": {"state": FAILED, "timestamp": ts}},
                                "$pushAll": {"log": log_dicts},
                            },
                        )
                        logger.exception("\n".join(log_messages))

        except SystemExit:
            logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
            import thread

            thread.interrupt_main()
    def check_pilot_states(self, pilot_col):
        """Check the SAGA job state of all pending or active pilots.

        Queries ``pilot_col`` for pilots of this pilot manager which are in
        state ``PENDING_ACTIVE`` or ``ACTIVE``, reconnects to the SAGA job of
        each one and inspects the job state:

          * job FAILED or CANCELED          -> pilot is marked ``FAILED``
          * job DONE                        -> pilot is marked ``DONE``
          * reconnect fails                 -> a per-pilot miss counter in
            ``self.missing_pilots`` is incremented; once it reaches
            ``JOB_CHECK_MAX_MISSES`` the pilot is marked ``FAILED``
          * state query fails after a successful reconnect
                                            -> pilot is marked ``FAILED``

        When a pilot is marked ``FAILED`` or ``DONE`` the logs obtained from
        ``self._get_pilot_logs()`` are stored in the pilot document.  The
        document is only updated if its state is not already ``DONE``.

        :param pilot_col: the MongoDB collection holding the pilot documents.
        """

        pending_pilots = pilot_col.find(
            {"pilotmanager": self.pilot_manager_id, "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}
        )

        for pending_pilot in pending_pilots:

            pilot_failed = False
            pilot_done = False
            reconnected = False
            pilot_id = pending_pilot["_id"]
            log_message = ""
            saga_job_id = pending_pilot["saga_job_id"]

            logger.info("Performing periodical health check for %s (SAGA job id %s)" % (str(pilot_id), saga_job_id))

            if pilot_id not in self.missing_pilots:
                self.missing_pilots[pilot_id] = 0

            # Create a job service object:
            try:
                # the job service URL is taken from the part of the job id
                # in front of the first "]-[", minus its leading character
                js_url = saga_job_id.split("]-[")[0][1:]

                if js_url in self._shared_worker_data["job_services"]:
                    js = self._shared_worker_data["job_services"][js_url]
                else:
                    js = saga.job.Service(js_url, session=self._session)
                    self._shared_worker_data["job_services"][js_url] = js

                saga_job = js.get_job(saga_job_id)
                reconnected = True

                # read the state only once, so that the checks below and the
                # log message all refer to the same value
                job_state = saga_job.state

                if job_state in [saga.job.FAILED, saga.job.CANCELED]:
                    pilot_failed = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, job_state)

                if job_state in [saga.job.DONE]:
                    pilot_done = True
                    log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, job_state)

            except Exception as e:

                if not reconnected:
                    # could not even get a job handle: count the miss, and
                    # only give up once the miss limit is reached
                    logger.warning("could not reconnect to pilot for state check (%s)" % e)
                    self.missing_pilots[pilot_id] += 1

                    if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                        logger.debug("giving up after %s attempts" % JOB_CHECK_MAX_MISSES)
                        pilot_failed = True
                        log_message = "Could not reconnect to pilot %s multiple times - giving up" % pilot_id
                else:
                    # we have a job handle, but the state query itself failed
                    logger.warning("pilot state check failed: %s" % e)
                    pilot_failed = True
                    log_message = (
                        "Couldn't determine job state for ComputePilot %s. Assuming it has failed." % pilot_id
                    )

            if pilot_failed or pilot_done:
                # failure takes precedence over completion
                final_state = FAILED if pilot_failed else DONE

                out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                ts = datetime.datetime.utcnow()

                # never overwrite a pilot which is already recorded as DONE
                pilot_col.update(
                    {"_id": pilot_id, "state": {"$ne": DONE}},
                    {
                        "$set": {"state": final_state, "stdout": out, "stderr": err, "logfile": log},
                        "$push": {
                            "statehistory": {"state": final_state, "timestamp": ts},
                            "log": {"message": log_message, "timestamp": ts},
                        },
                    },
                )
                logger.debug(log_message)

                if pilot_failed:
                    logger.warn("pilot %s declared dead" % pilot_id)
                else:
                    # a finished pilot is not a failure -- do not report it
                    # as dead
                    logger.info("pilot %s is done" % pilot_id)

            else:
                if self.missing_pilots[pilot_id]:
                    logger.info("pilot %s *assumed* alive and well (%s)" % (pilot_id, self.missing_pilots[pilot_id]))
                else:
                    logger.info("pilot %s seems alive and well" % (pilot_id))