def unschedule(self, units):
    """Placeholder hook for unscheduling one or more ComputeUnits.

    This implementation does not act on ``units``; it only logs a warning
    which names the scheduler (``self.name``).
    """
    message = "scheduler %s does not implement 'unschedule()'" % self.name
    logger.warn(message)
def pilot_added(self, pilot):
    """Placeholder hook, called to announce a new pilot to the scheduler.

    This implementation ignores ``pilot``; it only logs a warning which
    names the scheduler (``self.name``).
    """
    message = "scheduler %s does not implement 'pilot_added()'" % self.name
    logger.warn(message)
def pilot_removed(self, pilot):
    """Placeholder hook, called to announce the removal of a pilot.

    This implementation ignores ``pilot``; it only logs a warning which
    names the scheduler (``self.name``).
    """
    message = "scheduler %s does not implement 'pilot_removed()'" % self.name
    logger.warn(message)
def register_cancel_pilots_request(self, pilot_ids=None):
    """Register one or more pilots for cancelation.

    :param pilot_ids: ids of the pilots to cancel.  When ``None``, the ids
        of all pilots returned by ``self._db.get_pilots()`` for this pilot
        manager (``self._pm_id``) are used.

    The cancel command is first sent to the pilots via the database.
    Afterwards, pilots whose cached state (``self._shared_data``) is one of
    the pre-active states get their job canceled right away, while pilots
    in any other non-final state are canceled after a 10 second grace
    period.
    """
    if pilot_ids is None:
        known_pilots = self._db.get_pilots(pilot_manager_id=self._pm_id)
        pilot_ids = [str(entry["_id"]) for entry in known_pilots]

    self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
    logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

    # Pilots in ACTIVE state now get some time to react on the CANCEL
    # command sent above.  Meanwhile all pending pilots are canceled.  Once
    # that is done we wait a little (10 seconds) so that active pilots can
    # pick up the request and shut down -- those which do not are killed
    # the hard way.
    postponed = list()

    for pilot_id in pilot_ids:

        if pilot_id not in self._shared_data:
            logger.warn("can't actively cancel pilot %s: unknown pilot" % pilot_id)
            logger.debug(pprint.pformat(self._shared_data))
            continue

        # read the state from _shared_data only once, so that it cannot
        # change under us while we evaluate it
        old_state = str(self._shared_data[pilot_id]["data"]["state"])
        logger.warn("actively cancel pilot %s state: %s" % (pilot_id, old_state))

        if old_state in (DONE, FAILED, CANCELED):
            logger.warn("can't actively cancel pilot %s: already in final state" % pilot_id)
            continue

        if old_state not in (PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE):
            logger.debug("delay to actively cancel pilot %s: state %s" % (pilot_id, old_state))
            postponed.append(pilot_id)
            continue

        if pilot_id not in self._shared_worker_data['job_ids']:
            logger.warn("can't actively cancel pilot %s: no job id known" % pilot_id)
            logger.debug(pprint.pformat(self._shared_worker_data))
            continue

        try:
            job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
            self._shared_data[pilot_id]["data"]["state"] = CANCELING
            logger.info("actively cancel pilot %s (%s, %s)" % (pilot_id, job_id, js_url))

            service = self._shared_worker_data['job_services'][js_url]
            service.get_job(job_id).cancel()
        except Exception:
            logger.exception('pilot cancelation failed')

    # now tend to all delayed cancellation requests (i.e. active pilots),
    # if there are any
    if not postponed:
        return

    # grant some leeway to the unruly children...
    time.sleep(10)

    for pilot_id in postponed:

        if pilot_id not in self._shared_worker_data['job_ids']:
            logger.warn("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
            logger.debug(pprint.pformat(self._shared_worker_data))
            continue

        try:
            job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
            logger.info("actively cancel pilot %s (delayed) (%s, %s)" % (pilot_id, job_id, js_url))

            service = self._shared_worker_data['job_services'][js_url]
            service.get_job(job_id).cancel()
        except Exception:
            # best effort only: the failure is reported but not propagated
            logger.warn('delayed pilot cancelation failed. '
                        'This is not necessarily a problem.')
def handle_schedule(self, schedule):
    """Act on a unit schedule: bulk-submit units to pilots, queue the rest.

    :param schedule: mapping as read by this method: ``schedule['units']``
        maps each unit to a pilot id (or ``None`` if the unit could not be
        placed), and ``schedule['pilots'][pid]`` provides the ``'sandbox'``
        and ``'resource'`` entries of a pilot.  An empty/false schedule is
        skipped.
    :raises RuntimeError: if the schedule refers to a pilot id which is not
        in ``self.list_pilots()``.

    Units assigned to the same pilot are collected and handed to the worker
    in one call per pilot.  Units without a pilot are handed to
    ``self._worker.unschedule_compute_units()``, and a ``WAIT_QUEUE_SIZE``
    manager callback is fired whenever the number of such units changed.
    """
    if not schedule:
        logger.debug('skipping empty unit schedule')
        return

    pilot_cu_map = dict()
    unscheduled = list()
    pilot_ids = self.list_pilots()

    # group the units by target pilot, and pick out the unscheduled ones
    for unit, pid in schedule['units'].items():

        if pid is None:
            unscheduled.append(unit)
            continue

        if pid not in pilot_ids:
            raise RuntimeError("schedule points to unknown pilot %s" % pid)

        pilot_cu_map.setdefault(pid, list()).append(unit)

    # submit to all pilots which got something submitted to
    for pid, pilot_units in pilot_cu_map.items():

        units_to_schedule = list()

        for unit in pilot_units:

            if pid not in schedule['pilots']:
                # lost pilot, do not schedule unit
                logger.warn("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                continue

            unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

            # if a kernel name is set in the cu description, do kernel
            # expansion
            ud = unit.description

            if 'kernel' in ud and ud['kernel']:

                try:
                    from radical.ensemblemd.mdkernels import MDTaskDescription
                except Exception:
                    # the message fragments need explicit separating spaces
                    logger.error("Kernels are not supported in "
                                 "compute unit descriptions -- install "
                                 "radical.ensemblemd.mdkernels!")
                    # FIXME: unit needs a '_set_state() method or something!
                    self._session._dbs.set_compute_unit_state(
                        unit._uid, FAILED, ["kernel expansion failed"])
                    continue

                pilot_resource = schedule['pilots'][pid]['resource']

                mdtd = MDTaskDescription()
                mdtd.kernel = ud.kernel
                mdtd_bound = mdtd.bind(resource=pilot_resource)
                ud.environment = mdtd_bound.environment
                ud.pre_exec = mdtd_bound.pre_exec
                ud.executable = mdtd_bound.executable
                ud.mpi = mdtd_bound.mpi

            units_to_schedule.append(unit)

        if units_to_schedule:
            self._worker.schedule_compute_units(pilot_uid=pid,
                                                units=units_to_schedule)

    # report any change in wait_queue_size
    old_wait_queue_size = self.wait_queue_size
    self.wait_queue_size = len(unscheduled)

    if old_wait_queue_size != self.wait_queue_size:
        self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                           self.wait_queue_size)

    if unscheduled:
        self._worker.unschedule_compute_units(units=unscheduled)

    logger.info('%s units remain unscheduled' % len(unscheduled))
def _unit_state_callback(self, unit, state):
    """React on a unit state change reported to the scheduler.

    :param unit:  the unit whose state changed (its ``uid``,
        ``execution_details`` and ``description.cores`` are read).
    :param state: the new unit state.

    * ``NEW`` / ``UNSCHEDULED``: if the unit is found in a pilot's run
      queue, it is moved to the wait queue and rescheduled.
    * output staging and final states: the unit is removed from its pilot's
      run queue, its cores are added back to the pilot's ``'caps'``, and
      the pilot is rescheduled.

    Any exception raised while handling the callback is logged and
    swallowed.
    """
    try:
        with self.lock:

            uid = unit.uid
            logger.info("[SchedulerCallback]: Computeunit %s changed to %s" % (uid, state))

            if state in [NEW, UNSCHEDULED]:

                for pid in self.runqs:

                    if not pid:
                        logger.warning('cannot handle final unit %s w/o pilot information' % uid)

                    if uid in self.runqs[pid]:

                        logger.info('reschedule NEW unit %s from %s' % (uid, pid))

                        # move the unit from the run queue to the wait queue
                        unit = self.runqs[pid][uid]
                        del self.runqs[pid][uid]
                        self.waitq[uid] = unit

                        self._reschedule(uid=uid)
                        return

            # NOTE: callbacks cannot be unregistered, so invocations for
            #       units which are not (or no longer) known fall through.
            # FIXME: how can I *un*register a unit callback?

            if state in [PENDING_OUTPUT_STAGING, STAGING_OUTPUT, DONE, FAILED, CANCELED]:
                # the pilot which owned this CU should now have free slots
                # available
                # FIXME: how do I get the pilot from the CU?

                pid = unit.execution_details.get('pilot', None)

                if not pid:
                    raise RuntimeError('cannot handle final unit %s w/o pilot information' % uid)

                if pid not in self.pilots:
                    # Nothing to free on a pilot we do not track anymore.
                    # Return here so that the warning below never looks up
                    # self.pilots[pid] for a missing pilot (KeyError).
                    logger.warning('cannot handle unit %s cb for pilot %s (pilot is gone)' % (uid, pid))
                    return

                if uid in self.runqs[pid]:

                    unit = self.runqs[pid][uid]

                    del self.runqs[pid][uid]
                    self.pilots[pid]['caps'] += unit.description.cores
                    self._reschedule(target_pid=pid)

                else:
                    logger.warn('unit %s freed %s cores on %s (== %s) -- not reused'
                                % (uid, unit.description.cores, pid, self.pilots[pid]['caps']))

    except Exception as e:
        logger.error("error in unit callback for backfiller (%s) - ignored" % e)
def handle_schedule(self, schedule):
    """Act on a unit schedule: bulk-submit units to pilots, queue the rest.

    :param schedule: mapping as read by this method: ``schedule['units']``
        maps each unit to a pilot id (or ``None`` if the unit could not be
        placed), and ``schedule['pilots'][pid]`` provides the ``'sandbox'``
        and ``'resource'`` entries of a pilot.  An empty/false schedule is
        skipped.
    :raises RuntimeError: if the schedule refers to a pilot id which is not
        in ``self.list_pilots()``.

    Units assigned to the same pilot are collected and handed to the worker
    in one call per pilot.  Units without a pilot are handed to
    ``self._worker.unschedule_compute_units()``, and a ``WAIT_QUEUE_SIZE``
    manager callback is fired whenever the number of such units changed.
    """
    if not schedule:
        logger.debug('skipping empty unit schedule')
        return

    pilot_cu_map = dict()
    unscheduled = list()
    pilot_ids = self.list_pilots()

    # group the units by target pilot, and pick out the unscheduled ones
    for unit, pid in schedule['units'].items():

        if pid is None:
            unscheduled.append(unit)
            continue

        if pid not in pilot_ids:
            raise RuntimeError("schedule points to unknown pilot %s" % pid)

        pilot_cu_map.setdefault(pid, list()).append(unit)

    # submit to all pilots which got something submitted to
    for pid, pilot_units in pilot_cu_map.items():

        units_to_schedule = list()

        for unit in pilot_units:

            if pid not in schedule['pilots']:
                # lost pilot, do not schedule unit
                logger.warn("unschedule unit %s, lost pilot %s" % (unit.uid, pid))
                continue

            unit.sandbox = schedule['pilots'][pid]['sandbox'] + "/" + str(unit.uid)

            # if a kernel name is set in the cu description, do kernel
            # expansion
            ud = unit.description

            if 'kernel' in ud and ud['kernel']:

                try:
                    from radical.ensemblemd.mdkernels import MDTaskDescription
                except Exception:
                    # the message fragments need explicit separating spaces
                    logger.error("Kernels are not supported in "
                                 "compute unit descriptions -- install "
                                 "radical.ensemblemd.mdkernels!")
                    # FIXME: unit needs a '_set_state() method or something!
                    self._session._dbs.set_compute_unit_state(
                        unit._uid, FAILED, ["kernel expansion failed"])
                    continue

                pilot_resource = schedule['pilots'][pid]['resource']

                mdtd = MDTaskDescription()
                mdtd.kernel = ud.kernel
                mdtd_bound = mdtd.bind(resource=pilot_resource)
                ud.environment = mdtd_bound.environment
                ud.pre_exec = mdtd_bound.pre_exec
                ud.executable = mdtd_bound.executable
                ud.mpi = mdtd_bound.mpi

            units_to_schedule.append(unit)

        if units_to_schedule:
            self._worker.schedule_compute_units(pilot_uid=pid,
                                                units=units_to_schedule)

    # report any change in wait_queue_size
    old_wait_queue_size = self.wait_queue_size
    self.wait_queue_size = len(unscheduled)

    if old_wait_queue_size != self.wait_queue_size:
        self._worker.fire_manager_callback(WAIT_QUEUE_SIZE, self,
                                           self.wait_queue_size)

    if unscheduled:
        self._worker.unschedule_compute_units(units=unscheduled)

    logger.info('%s units remain unscheduled' % len(unscheduled))
def schedule_compute_units(self, pilot_uid, units):
    """Request the scheduling of one or more ComputeUnits on a ComputePilot.

    :param pilot_uid: id of the pilot the units are assigned to.
    :param units:     the units to schedule.

    For every unit, the input and output staging directives from its
    description are split into directives handled by the file transfer
    worker (``FTW_*``) and directives handled by the agent (``Agent_*``).
    Units with any input staging directive are set to
    ``PENDING_INPUT_STAGING``, all others to ``PENDING_EXECUTION``.  Both
    groups are assigned to the pilot in the database.

    Any exception is logged and re-raised.
    """

    def _as_list(directives):
        # Workaround for iterating on the attribute interface: a single
        # directive is wrapped into a list, an empty value becomes [].
        # TODO: Verify if this piece of code is actually still required
        if isinstance(directives, list):
            return directives
        if directives:
            return [directives]
        return []

    try:
        cu_transfer = list()
        cu_notransfer = list()

        # Get some information about the pilot sandbox from the database.
        pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
        # TODO: this hack below relies on what?!  That there is just one pilot?
        pilot_sandbox = pilot_info[0]['sandbox']

        # Split units into two different lists: the first list contains the
        # CUs that need file transfer and the second list contains the CUs
        # that don't.  The latter is added to the pilot directly, while the
        # former is added to the transfer queue.
        for unit in units:

            # Create object for staging status tracking
            unit.FTW_Input_Status = None
            unit.FTW_Input_Directives = []
            unit.Agent_Input_Status = None
            unit.Agent_Input_Directives = []
            unit.FTW_Output_Status = None
            unit.FTW_Output_Directives = []
            unit.Agent_Output_Status = None
            unit.Agent_Output_Directives = []

            # Split the input staging directives over the transfer worker
            # and the agent
            for input_sd_entry in _as_list(unit.description.input_staging):

                action = input_sd_entry['action']
                source = Url(input_sd_entry['source'])
                target = Url(input_sd_entry['target'])

                new_sd = {'action':   action,
                          'source':   str(source),
                          'target':   str(target),
                          'flags':    input_sd_entry['flags'],
                          'priority': input_sd_entry['priority'],
                          'state':    PENDING
                }

                if action in [LINK, COPY, MOVE]:
                    unit.Agent_Input_Directives.append(new_sd)
                    unit.Agent_Input_Status = PENDING
                elif action in [TRANSFER]:
                    if source.scheme and source.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote pull from the agent
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    else:
                        # Transfer from local to sandbox
                        unit.FTW_Input_Directives.append(new_sd)
                        unit.FTW_Input_Status = PENDING
                else:
                    logger.warn('Not sure if action %s makes sense for input staging' % action)

            # Split the output staging directives over the transfer worker
            # and the agent
            for output_sds_entry in _as_list(unit.description.output_staging):

                action = output_sds_entry['action']
                source = Url(output_sds_entry['source'])
                target = Url(output_sds_entry['target'])

                new_sd = {'action':   action,
                          'source':   str(source),
                          'target':   str(target),
                          'flags':    output_sds_entry['flags'],
                          'priority': output_sds_entry['priority'],
                          'state':    PENDING
                }

                if action == LINK or action == COPY or action == MOVE:
                    unit.Agent_Output_Directives.append(new_sd)
                    unit.Agent_Output_Status = NEW
                elif action == TRANSFER:
                    if target.scheme and target.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote push from the agent
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    else:
                        # Transfer from sandbox back to local
                        unit.FTW_Output_Directives.append(new_sd)
                        unit.FTW_Output_Status = NEW
                else:
                    logger.warn('Not sure if action %s makes sense for output staging' % action)

            if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_INPUT_STAGING, log)
                cu_transfer.append(unit)
            else:
                cu_notransfer.append(unit)

        # Bulk-add all non-transfer units, then all transfer units
        self._db.assign_compute_units_to_pilot(
            units=cu_notransfer,
            pilot_uid=pilot_uid,
            pilot_sandbox=pilot_sandbox
        )

        self._db.assign_compute_units_to_pilot(
            units=cu_transfer,
            pilot_uid=pilot_uid,
            pilot_sandbox=pilot_sandbox
        )

        for unit in cu_notransfer:
            log = "Scheduled for execution on ComputePilot %s." % pilot_uid
            self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)

        logger.info(
            "Scheduled ComputeUnits %s for execution on ComputePilot '%s'." %
            (cu_notransfer, pilot_uid)
        )

    except Exception:
        # `except Exception, e` is Python-2-only syntax; the exception
        # object is not needed here as logger.exception() records it.
        logger.exception('error in unit manager controller (schedule())')
        raise
def _pilot_state_callback(self, pilot, state):
    """React on a pilot state change reported to the scheduler.

    :param pilot: the pilot whose state changed (its ``uid`` is read).
    :param state: the new pilot state.

    Callbacks for pilots which are not in ``self.pilots`` are ignored.
    Otherwise the cached pilot state is updated.  An ``ACTIVE`` pilot
    triggers a reschedule.  For a final pilot state the pilot's units are
    updated in the database (see below), and the pilot is removed from
    ``self.pilots``.

    Exceptions are logged and re-raised.
    """
    try:
        with self.lock:

            pid = pilot.uid

            if pid not in self.pilots:
                # as we cannot unregister callbacks, we simply ignore this
                # invocation.  It is probably from a pilot we used
                # previously.
                logger.warn("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)" % (pid, state))
                return

            self.pilots[pid]['state'] = state
            logger.debug("[SchedulerCallback]: ComputePilot %s changed to %s" % (pid, state))

            if state in (ACTIVE,):
                # the pilot is now ready to be used
                self._reschedule(target_pid=pid)

            if state in (DONE, FAILED, CANCELED):

                # The pilot is final: units which did not start executing
                # yet go back to UNSCHEDULED, as do restartable units which
                # did.  Non-restartable units which were executing or
                # staging output are marked FAILED.
                timestamp = datetime.datetime.utcnow()
                running = [EXECUTING, PENDING_OUTPUT_STAGING, STAGING_OUTPUT]

                updates = [
                    # (filter_dict, set_dict, state pushed to the history)
                    ({"pilot": pid,
                      "state": {"$in": [UNSCHEDULED, PENDING_INPUT_STAGING,
                                        STAGING_INPUT, PENDING_EXECUTION,
                                        SCHEDULING]}},
                     {"state": UNSCHEDULED, "pilot": None},
                     UNSCHEDULED),

                    ({"pilot": pid,
                      "restartable": True,
                      "state": {"$in": list(running)}},
                     {"state": UNSCHEDULED, "pilot": None},
                     UNSCHEDULED),

                    ({"pilot": pid,
                      "restartable": False,
                      "state": {"$in": list(running)}},
                     {"state": FAILED},
                     FAILED),
                ]

                for unit_filter, unit_update, new_state in updates:
                    self._db.change_compute_units(
                        filter_dict=unit_filter,
                        set_dict=unit_update,
                        push_dict={"statehistory": {"state": new_state,
                                                    "timestamp": timestamp},
                                   "log": {"message": "reschedule unit",
                                           "timestamp": timestamp}})

                # FIXME AM: state management: the new states were just
                # pushed to the DB, but we do not know for which units, and
                # the state known to the worker (i.e. the cached state) is
                # most likely outdated.
                #
                # So runq/waitq are not handled here.  Instead, we rely on
                # the unit callback to get invoked as soon as the state
                # propagated back to us, which then removes the units from
                # the runq.  This is slow, potentially very slow, but safe.

                # we can't use this pilot anymore...
                del self.pilots[pid]
                # FIXME: how can I *un*register a pilot callback?

    except Exception as e:
        logger.exception("error in pilot callback for backfiller (%s) - ignored" % e)
        raise
def run(self):
    """Starts the process when Process.start() is called.

    Main loop of the pilot launcher.  After connecting to the database, the
    loop runs until ``self._stop`` is set and

    * calls ``self.check_pilot_states()`` every ``JOB_CHECK_INTERVAL``
      seconds, and
    * atomically picks one pilot in state ``PENDING_LAUNCH`` (moving it to
      ``LAUNCHING``), stages the bootstrapper (plus sdists and CA certs
      when needed) into the pilot sandbox, and submits the bootstrapper as
      a SAGA job.  On success the pilot is moved to ``PENDING_ACTIVE``, on
      any error it is moved to ``FAILED``.

    If no pilot is waiting, the loop sleeps for ``IDLE_TIMER`` seconds.
    A ``SystemExit`` raised within the loop interrupts the main thread.
    """

    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." % self.pilot_manager_id)

        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reasons and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job
            # service, a job description and a job that is then sent to the
            # local or remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}}
            )

            if not compute_pilot:
                time.sleep(IDLE_TIMER)

            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid = self.db_connection_info.session_id
                    database_url = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores = compute_pilot['description']['cores']
                    runtime = compute_pilot['description']['runtime']
                    queue = compute_pilot['description']['queue']
                    project = compute_pilot['description']['project']
                    cleanup = compute_pilot['description']['cleanup']
                    resource_key = compute_pilot['description']['resource']
                    schema = compute_pilot['description']['access_schema']
                    memory = compute_pilot['description']['memory']
                    pilot_sandbox = compute_pilot['sandbox']
                    global_sandbox = compute_pilot['global_sandbox']

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema so better use a deep
                    # copy..
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_mongodb_endpoint = resource_cfg.get('agent_mongodb_endpoint', database_url)
                    agent_spawner = resource_cfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
                    agent_type = resource_cfg.get('agent_type', DEFAULT_AGENT_TYPE)
                    agent_scheduler = resource_cfg.get('agent_scheduler')
                    tunnel_bind_device = resource_cfg.get('tunnel_bind_device')
                    default_queue = resource_cfg.get('default_queue')
                    forward_tunnel_endpoint = resource_cfg.get('forward_tunnel_endpoint')
                    js_endpoint = resource_cfg.get('job_manager_endpoint')
                    lrms = resource_cfg.get('lrms')
                    mpi_launch_method = resource_cfg.get('mpi_launch_method')
                    pre_bootstrap = resource_cfg.get('pre_bootstrap')
                    python_interpreter = resource_cfg.get('python_interpreter')
                    spmd_variation = resource_cfg.get('spmd_variation')
                    task_launch_method = resource_cfg.get('task_launch_method')
                    rp_version = resource_cfg.get('rp_version', DEFAULT_RP_VERSION)
                    virtenv_mode = resource_cfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
                    virtenv = resource_cfg.get('virtenv', DEFAULT_VIRTENV)

                    # The flag is expected as string ('True'/'False'), but
                    # str() makes this also work for a real boolean.
                    stage_cacerts = str(resource_cfg.get('stage_cacerts', 'False')).lower() == 'true'

                    # expand variables in virtenv string
                    virtenv = virtenv % {'pilot_sandbox': saga.Url(pilot_sandbox).path,
                                         'global_sandbox': saga.Url(global_sandbox).path}

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get('global_virtenv')
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = 'use'

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = 'mongodb'
                    if not db_url.host:
                        db_url.host = 'localhost'
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = 'radicalpilot'

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We use always "default_bootstrapper.sh"
                    bootstrapper = 'default_bootstrapper.sh'
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s"
                                                        % (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from
                    # rp_version, which has the following format
                    # and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit: # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release: # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed: # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples   :
                    #   [email protected]
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode.  If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # NOTE(review): the comment above documents 'release' as
                    # valid and the staging check below handles it, but the
                    # validation here rejects it -- confirm which one is
                    # intended.
                    if not rp_version.startswith('@') and \
                       rp_version not in ['installed', 'local', 'debug']:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ['installed', 'release']:
                        stage_sdist = False

                    if rp_version.startswith('@'):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:

                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:

                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # some machines cannot run pip due to outdated ca certs.
                    # For those, we also stage an updated cert bundle
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s"
                                                  % (mod_dir, 'cacert.pem.gz'))

                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    debug_level = os.environ.get('RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        debug_level = {
                            'CRITICAL': 1,
                            'ERROR':    2,
                            'WARNING':  3,
                            'WARN':     3,
                            'INFO':     4,
                            'DEBUG':    5
                        }.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = 'luve'    #  l : log files
                                            #  u : unit work dirs
                                            #  v : virtualenv
                                            #  e : everything (== pilot sandbox)

                        # we never cleanup virtenvs which are not private.
                        # This needs a value comparison: an identity check
                        # (`is not`) against a string literal is not
                        # reliable.
                        if virtenv_mode != 'private':
                            cleanup = cleanup.replace('v', '')

                    sdists = ':'.join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        # one '-e' option per pre_bootstrap entry
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it
                    # is, we can launch the agent.  Job services are cached
                    # per URL in the shared worker data.
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data['job_services']:
                        js = self._shared_worker_data['job_services'][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data['job_services'][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable = "/bin/bash"
                    jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory = saga.Url(pilot_sandbox).path
                    jd.project = project
                    jd.output = "agent.out"
                    jd.error = "agent.err"
                    jd.total_cpu_count = number_cores
                    jd.wall_time_limit = runtime
                    jd.total_physical_memory = memory
                    jd.queue = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if 'RADICAL_PILOT_PROFILE' in os.environ:
                        jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    # remember the job id, so that the pilot can be canceled
                    # via its job service later on
                    saga_job_id = pilotjob.id
                    self._shared_worker_data['job_ids'][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                    # job submission was successful.
                    # NOTE(review): the filter uses the literal 'Launching'
                    # rather than the LAUNCHING constant -- presumably both
                    # are equal; confirm.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id": pilot_id,
                         "state": 'Launching'},
                        {"$set": {"state": PENDING_ACTIVE,
                                  "saga_job_id": saga_job_id},
                         "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}
                        }
                    )

                    if ret['n'] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # jobid then
                        # FIXME: make sure of the agent state!
                        pilot_col.update(
                            {"_id": pilot_id},
                            {"$set": {"saga_job_id": saga_job_id},
                             "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                        "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}}
                        )

                except Exception as e:
                    # Update the Pilot's state 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing an '#'.  This shows up here.
                    # Until we find a clean workaround, make log shorter and
                    # rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id": pilot_id,
                         "state": {"$ne": FAILED}},
                        {"$set": {"state": FAILED,
                                  "stdout": out,
                                  "stderr": err,
                                  "logfile": log},
                         "$push": {"statehistory": {"state": FAILED,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}
                        }
                    )
                    logger.exception('\n'.join(log_messages))

    except SystemExit:
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def check_pilot_states(self, pilot_col):
    """Check the SAGA job state of every pending or active pilot.

    Queries ``pilot_col`` for the pilots of this pilot manager which are in
    state PENDING_ACTIVE or ACTIVE, reconnects to the SAGA job recorded for
    each of them, and moves the pilot to a final state in the database when
    the job turns out to be finished:

      * SAGA job FAILED or CANCELED                -> pilot state FAILED
      * SAGA job DONE                              -> pilot state DONE
      * error after a successful reconnect         -> pilot state FAILED
      * reconnect failed JOB_CHECK_MAX_MISSES
        times (counted in ``self.missing_pilots``) -> pilot state FAILED

    Pilots already recorded as DONE in the database are never overwritten.

    Args:
        pilot_col: MongoDB collection holding the pilot documents.
    """
    pending_pilots = pilot_col.find({
        "pilotmanager": self.pilot_manager_id,
        "state": {"$in": [PENDING_ACTIVE, ACTIVE]},
    })

    for pending_pilot in pending_pilots:

        pilot_failed = False
        pilot_done = False
        reconnected = False
        log_message = ""

        pilot_id = pending_pilot["_id"]
        saga_job_id = pending_pilot["saga_job_id"]

        logger.info("Performing periodical health check for %s (SAGA job id %s)"
                    % (str(pilot_id), saga_job_id))

        if pilot_id not in self.missing_pilots:
            self.missing_pilots[pilot_id] = 0

        # Create a job service object (or reuse a cached one) and look up
        # the job.
        try:
            # The job service URL is taken from the SAGA job id: everything
            # before the first ']-[' separator, minus the leading character.
            js_url = saga_job_id.split("]-[")[0][1:]

            if js_url in self._shared_worker_data['job_services']:
                js = self._shared_worker_data['job_services'][js_url]
            else:
                js = saga.job.Service(js_url, session=self._session)
                self._shared_worker_data['job_services'][js_url] = js

            saga_job = js.get_job(saga_job_id)
            reconnected = True

            if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                pilot_failed = True
                log_message = "SAGA job state for ComputePilot %s is %s." \
                              % (pilot_id, saga_job.state)

            if saga_job.state in [saga.job.DONE]:
                pilot_done = True
                log_message = "SAGA job state for ComputePilot %s is %s." \
                              % (pilot_id, saga_job.state)

        except Exception as e:

            if not reconnected:
                # Could not even reach the job: tolerate a limited number of
                # misses before declaring the pilot dead.
                # NOTE(review): the miss counter is never reset after a
                # successful reconnect, so misses accumulate over the
                # pilot's lifetime -- confirm that this is intended.
                logger.warning('could not reconnect to pilot for state check (%s)' % e)
                self.missing_pilots[pilot_id] += 1

                if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                    logger.debug('giving up after %s attempts' % JOB_CHECK_MAX_MISSES)
                    pilot_failed = True
                    log_message = "Could not reconnect to pilot %s "\
                                  "multiple times - giving up" % pilot_id
            else:
                logger.warning('pilot state check failed: %s' % e)
                pilot_failed = True
                log_message = "Couldn't determine job state for ComputePilot %s. " \
                              "Assuming it has failed." % pilot_id

        if pilot_failed or pilot_done:
            # A failure takes precedence over DONE, as both flags may be set
            # if an error occurs after the DONE state was seen.
            final_state = FAILED if pilot_failed else DONE

            out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
            ts = datetime.datetime.utcnow()

            # The '$ne: DONE' filter ensures a pilot which is already DONE in
            # the database is not touched again.
            pilot_col.update(
                {"_id": pilot_id, "state": {"$ne": DONE}},
                {"$set": {"state": final_state,
                          "stdout": out,
                          "stderr": err,
                          "logfile": log},
                 "$push": {"statehistory": {"state": final_state, "timestamp": ts},
                           "log": {"message": log_message, "timestamp": ts}}})

            logger.debug(log_message)
            if pilot_failed:
                logger.warn('pilot %s declared dead' % pilot_id)
            else:
                logger.info('pilot %s is done' % pilot_id)

        else:
            if self.missing_pilots[pilot_id]:
                logger.info('pilot %s *assumed* alive and well (%s)'
                            % (pilot_id, self.missing_pilots[pilot_id]))
            else:
                logger.info('pilot %s seems alive and well' % (pilot_id))
def schedule_compute_units(self, pilot_uid, units):
    """Request the scheduling of one or more ComputeUnits on a ComputePilot.

    For every unit the input and output staging directives of its
    description are split between the file transfer worker (``FTW_*``
    attributes) and the agent (``Agent_*`` attributes).  Units with any
    input staging directive are set to PENDING_INPUT_STAGING, all other
    units are set to PENDING_EXECUTION.  All units are assigned to the pilot
    in the database.

    Args:
        pilot_uid: id of the pilot the units are scheduled on.
        units: iterable of ComputeUnit objects; each one is annotated
            in place with the staging attributes described above.

    Raises:
        Exception: any error is logged and re-raised unchanged.
    """
    try:
        cu_transfer = list()
        cu_notransfer = list()

        # Get some information about the pilot sandbox from the database.
        pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
        # TODO: this hack below relies on what?! That there is just one pilot?
        pilot_sandbox = pilot_info[0]['sandbox']

        # Split units into two different lists: the first list contains the
        # CUs that need input staging and the second list contains the CUs
        # that don't.
        for unit in units:

            # Create objects for staging status tracking
            unit.FTW_Input_Status = None
            unit.FTW_Input_Directives = []
            unit.Agent_Input_Status = None
            unit.Agent_Input_Directives = []
            unit.FTW_Output_Status = None
            unit.FTW_Output_Directives = []
            unit.Agent_Output_Status = None
            unit.Agent_Output_Directives = []

            # Split the input staging directives over the transfer worker
            # and the agent
            input_sds = unit.description.input_staging
            if not isinstance(input_sds, list):
                # Ugly, but is a workaround for iterating on attribute interface
                # TODO: Verify if this piece of code is actually still required
                if input_sds:
                    input_sds = [input_sds]
                else:
                    input_sds = []

            for input_sd_entry in input_sds:
                action = input_sd_entry['action']
                source = Url(input_sd_entry['source'])
                target = Url(input_sd_entry['target'])

                new_sd = {'action': action,
                          'source': str(source),
                          'target': str(target),
                          'flags': input_sd_entry['flags'],
                          'priority': input_sd_entry['priority'],
                          'state': PENDING}

                if action in [LINK, COPY, MOVE]:
                    unit.Agent_Input_Directives.append(new_sd)
                    unit.Agent_Input_Status = PENDING
                elif action in [TRANSFER]:
                    if source.scheme and source.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote pull from the agent
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    else:
                        # Transfer from local to sandbox
                        unit.FTW_Input_Directives.append(new_sd)
                        unit.FTW_Input_Status = PENDING
                else:
                    logger.warn('Not sure if action %s makes sense for input staging' % action)

            # Split the output staging directives over the transfer worker
            # and the agent
            output_sds = unit.description.output_staging
            if not isinstance(output_sds, list):
                # Ugly, but is a workaround for iterating on att iface
                # TODO: Verify if this piece of code is actually still required
                if output_sds:
                    output_sds = [output_sds]
                else:
                    output_sds = []

            for output_sds_entry in output_sds:
                action = output_sds_entry['action']
                source = Url(output_sds_entry['source'])
                target = Url(output_sds_entry['target'])

                new_sd = {'action': action,
                          'source': str(source),
                          'target': str(target),
                          'flags': output_sds_entry['flags'],
                          'priority': output_sds_entry['priority'],
                          'state': PENDING}

                if action in [LINK, COPY, MOVE]:
                    unit.Agent_Output_Directives.append(new_sd)
                    unit.Agent_Output_Status = NEW
                elif action == TRANSFER:
                    if target.scheme and target.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote push from the agent
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    else:
                        # Transfer from sandbox back to local
                        unit.FTW_Output_Directives.append(new_sd)
                        unit.FTW_Output_Status = NEW
                else:
                    logger.warn('Not sure if action %s makes sense for output staging' % action)

            # Only *input* staging decides which list a unit ends up in.
            if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid, PENDING_INPUT_STAGING, log)
                cu_transfer.append(unit)
            else:
                cu_notransfer.append(unit)

        # Bulk-add both groups of units to the pilot, non-transfer units first.
        self._db.assign_compute_units_to_pilot(units=cu_notransfer,
                                               pilot_uid=pilot_uid,
                                               pilot_sandbox=pilot_sandbox)
        self._db.assign_compute_units_to_pilot(units=cu_transfer,
                                               pilot_uid=pilot_uid,
                                               pilot_sandbox=pilot_sandbox)

        for unit in cu_notransfer:
            log = "Scheduled for execution on ComputePilot %s." % pilot_uid
            self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)

        logger.info("Scheduled ComputeUnits %s for execution on ComputePilot '%s'."
                    % (cu_notransfer, pilot_uid))

    except Exception:
        # Log with traceback, then let the caller deal with the error.
        logger.exception('error in unit manager controller (schedule())')
        raise
def run(self):
    """Main loop of the pilot launcher worker.

    (Original note: starts the process when Process.start() is called.)

    Connects to MongoDB and then, until ``self._stop`` is set:

      * calls ``check_pilot_states()`` whenever more than JOB_CHECK_INTERVAL
        seconds passed since the last check;
      * atomically picks one pilot of this pilot manager in state
        PENDING_LAUNCH, moves it to LAUNCHING, stages the bootstrapper (and
        optionally sdists and a CA cert bundle) into the pilot sandbox and
        submits the bootstrapper as a SAGA job.  On success the pilot is
        moved to PENDING_ACTIVE, on any error to FAILED;
      * sleeps IDLE_TIMER seconds if no pilot is waiting.

    Returns early (after logging) if the database connection fails.  A
    SystemExit raised anywhere in the loop is logged and forwarded to the
    main thread via ``thread.interrupt_main()``.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s."
                         % self.pilot_manager_id)

        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reasons and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job
            # service, a job description and a job that is then sent to the
            # local or remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}},
            )

            if not compute_pilot:
                time.sleep(IDLE_TIMER)

            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid = self.db_connection_info.session_id
                    database_url = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores = compute_pilot["description"]["cores"]
                    runtime = compute_pilot["description"]["runtime"]
                    queue = compute_pilot["description"]["queue"]
                    project = compute_pilot["description"]["project"]
                    cleanup = compute_pilot["description"]["cleanup"]
                    resource_key = compute_pilot["description"]["resource"]
                    schema = compute_pilot["description"]["access_schema"]
                    memory = compute_pilot["description"]["memory"]
                    pilot_sandbox = compute_pilot["sandbox"]
                    global_sandbox = compute_pilot["global_sandbox"]

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema so better use a deep
                    # copy..
                    # NOTE(review): no copy is made here -- presumably
                    # get_resource_config() hands out a private copy; confirm.
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_mongodb_endpoint = resource_cfg.get("agent_mongodb_endpoint", database_url)
                    agent_spawner = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                    agent_type = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                    agent_scheduler = resource_cfg.get("agent_scheduler")
                    tunnel_bind_device = resource_cfg.get("tunnel_bind_device")
                    default_queue = resource_cfg.get("default_queue")
                    forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                    js_endpoint = resource_cfg.get("job_manager_endpoint")
                    lrms = resource_cfg.get("lrms")
                    mpi_launch_method = resource_cfg.get("mpi_launch_method")
                    pre_bootstrap = resource_cfg.get("pre_bootstrap")
                    python_interpreter = resource_cfg.get("python_interpreter")
                    spmd_variation = resource_cfg.get("spmd_variation")
                    task_launch_method = resource_cfg.get("task_launch_method")
                    rp_version = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                    virtenv_mode = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                    virtenv = resource_cfg.get("virtenv", DEFAULT_VIRTENV)

                    # Accept both the string form ("True"/"true") and a real
                    # boolean for the 'stage_cacerts' setting.
                    stage_cacerts = resource_cfg.get("stage_cacerts", "False")
                    stage_cacerts = (str(stage_cacerts).lower() == "true")

                    # expand variables in virtenv string
                    virtenv = virtenv % {
                        "pilot_sandbox": saga.Url(pilot_sandbox).path,
                        "global_sandbox": saga.Url(global_sandbox).path,
                    }

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get("global_virtenv")
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = "use"

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = "mongodb"
                    if not db_url.host:
                        db_url.host = "localhost"
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = "radicalpilot"

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We use always "default_bootstrapper.sh"
                    bootstrapper = "default_bootstrapper.sh"
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s"
                                                        % (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from
                    # rp_version, which has the following format
                    # and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit: # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release: # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug: # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed: # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples   :
                    #   virtenv@<tag>
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode.  If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # A rp_version which does not adhere to the above syntax
                    # is rejected with a ValueError, which fails the launch.
                    # 'release' is accepted here as it is documented above
                    # and handled by the staging decision below.
                    if not rp_version.startswith("@") and \
                       rp_version not in ["installed", "local", "debug", "release"]:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ["installed", "release"]:
                        stage_sdist = False

                    if rp_version.startswith("@"):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:
                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." \
                                  % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # some machines cannot run pip due to outdated ca certs.
                    # For those, we also stage an updated cert bundle
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s"
                                                  % (mod_dir, "cacert.pem.gz"))
                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values: the debug level may be given as a
                    # number or as a level name; unknown names map to 0.
                    debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        debug_level = {
                            "CRITICAL": 1,
                            "ERROR": 2,
                            "WARNING": 3,
                            "WARN": 3,
                            "INFO": 4,
                            "DEBUG": 5,
                        }.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = "luve"  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)

                    # we never cleanup virtenvs which are not private.
                    # Compare by value ('is not' on strings tests identity),
                    # and only touch 'cleanup' if there is something to
                    # clean up at all.
                    if cleanup and virtenv_mode != "private":
                        cleanup = cleanup.replace("v", "")

                    sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        # one '-e' option per pre_bootstrap entry
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it
                    # is, we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data["job_services"]:
                        js = self._shared_worker_data["job_services"][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data["job_services"][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable = "/bin/bash"
                    jd.arguments = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory = saga.Url(pilot_sandbox).path
                    jd.project = project
                    jd.output = "agent.out"
                    jd.error = "agent.err"
                    jd.total_cpu_count = number_cores
                    jd.wall_time_limit = runtime
                    jd.total_physical_memory = memory
                    jd.queue = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if "RADICAL_PILOT_PROFILE" in os.environ:
                        jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                    # job submission was successful.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": "Launching"},
                        {"$set": {"state": PENDING_ACTIVE,
                                  "saga_job_id": saga_job_id},
                         "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

                    if ret["n"] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # jobid then
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {"$set": {"saga_job_id": saga_job_id},
                             "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                        "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}})

                except Exception as e:
                    # Update the Pilot's state 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing an '#'.  This shows up here.
                    # Until we find a clean workaround, make log shorter and
                    # rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    # Do not push a second FAILED entry if the pilot is
                    # already marked as FAILED.
                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {"$set": {"state": FAILED,
                                  "stdout": out,
                                  "stderr": err,
                                  "logfile": log},
                         "$push": {"statehistory": {"state": FAILED,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

                    logger.exception("\n".join(log_messages))

    except SystemExit:
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def check_pilot_states(self, pilot_col):
    """Check the SAGA job state of every pending or active pilot.

    Queries ``pilot_col`` for the pilots of this pilot manager which are in
    state PENDING_ACTIVE or ACTIVE, reconnects to the SAGA job recorded for
    each of them, and moves the pilot to a final state in the database when
    the job turns out to be finished:

      * SAGA job FAILED or CANCELED                -> pilot state FAILED
      * SAGA job DONE                              -> pilot state DONE
      * error after a successful reconnect         -> pilot state FAILED
      * reconnect failed JOB_CHECK_MAX_MISSES
        times (counted in ``self.missing_pilots``) -> pilot state FAILED

    Pilots already recorded as DONE in the database are never overwritten.

    Args:
        pilot_col: MongoDB collection holding the pilot documents.
    """
    pending_pilots = pilot_col.find(
        {"pilotmanager": self.pilot_manager_id, "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}
    )

    for pending_pilot in pending_pilots:

        pilot_failed = False
        pilot_done = False
        reconnected = False
        log_message = ""

        pilot_id = pending_pilot["_id"]
        saga_job_id = pending_pilot["saga_job_id"]

        logger.info("Performing periodical health check for %s (SAGA job id %s)" % (str(pilot_id), saga_job_id))

        if pilot_id not in self.missing_pilots:
            self.missing_pilots[pilot_id] = 0

        # Create a job service object (or reuse a cached one) and look up
        # the job.
        try:
            # The job service URL is taken from the SAGA job id: everything
            # before the first ']-[' separator, minus the leading character.
            js_url = saga_job_id.split("]-[")[0][1:]

            if js_url in self._shared_worker_data["job_services"]:
                js = self._shared_worker_data["job_services"][js_url]
            else:
                js = saga.job.Service(js_url, session=self._session)
                self._shared_worker_data["job_services"][js_url] = js

            saga_job = js.get_job(saga_job_id)
            reconnected = True

            if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                pilot_failed = True
                log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

            if saga_job.state in [saga.job.DONE]:
                pilot_done = True
                log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

        except Exception as e:

            if not reconnected:
                # Could not even reach the job: tolerate a limited number of
                # misses before declaring the pilot dead.
                # NOTE(review): the miss counter is never reset after a
                # successful reconnect, so misses accumulate over the
                # pilot's lifetime -- confirm that this is intended.
                logger.warning("could not reconnect to pilot for state check (%s)" % e)
                self.missing_pilots[pilot_id] += 1

                if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                    logger.debug("giving up after %s attempts" % JOB_CHECK_MAX_MISSES)
                    pilot_failed = True
                    log_message = "Could not reconnect to pilot %s " "multiple times - giving up" % pilot_id
            else:
                logger.warning("pilot state check failed: %s" % e)
                pilot_failed = True
                log_message = (
                    "Couldn't determine job state for ComputePilot %s. " "Assuming it has failed." % pilot_id
                )

        if pilot_failed or pilot_done:
            # A failure takes precedence over DONE, as both flags may be set
            # if an error occurs after the DONE state was seen.
            final_state = FAILED if pilot_failed else DONE

            out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
            ts = datetime.datetime.utcnow()

            # The '$ne: DONE' filter ensures a pilot which is already DONE in
            # the database is not touched again.
            pilot_col.update(
                {"_id": pilot_id, "state": {"$ne": DONE}},
                {
                    "$set": {"state": final_state, "stdout": out, "stderr": err, "logfile": log},
                    "$push": {
                        "statehistory": {"state": final_state, "timestamp": ts},
                        "log": {"message": log_message, "timestamp": ts},
                    },
                },
            )

            logger.debug(log_message)
            if pilot_failed:
                logger.warn("pilot %s declared dead" % pilot_id)
            else:
                logger.info("pilot %s is done" % pilot_id)

        else:
            if self.missing_pilots[pilot_id]:
                logger.info("pilot %s *assumed* alive and well (%s)" % (pilot_id, self.missing_pilots[pilot_id]))
            else:
                logger.info("pilot %s seems alive and well" % (pilot_id))