def get_resource_config(self, resource_key, schema=None):
    """Return a deep copy of the configuration for *resource_key*.

    Deprecated resource keys are transparently mapped to their current
    alias (with a warning).  If *schema* is not given, the resource's
    first configured schema is used; the schema-specific settings are
    merged into the top level of the returned dictionary.

    Raises PilotException for unknown resource keys, RuntimeError for
    unknown schemas.
    """
    if resource_key in self._resource_aliases:
        alias = self._resource_aliases[resource_key]
        logger.warning("using alias '%s' for deprecated resource key '%s'"
                       % (alias, resource_key))
        resource_key = alias

    if resource_key not in self._resource_configs:
        raise PilotException("Resource key '%s' is not known." % resource_key)

    # work on a copy, so that schema expansion below does not leak into
    # the stored configuration
    cfg = copy.deepcopy(self._resource_configs[resource_key])

    # fall back to the first configured schema if none was requested
    if not schema and 'schemas' in cfg:
        schema = cfg['schemas'][0]

    if schema:
        if schema not in cfg:
            raise RuntimeError("schema %s unknown for resource %s"
                               % (schema, resource_key))
        # lift the schema-specific keys to the top level
        cfg.update(cfg[schema])

    return cfg
def add_pilots(self, pilots):
    """Associate one or more pilots with the unit manager.

    **Arguments:**
        * **pilots** [:class:`radical.pilot.ComputePilot` or list of
          :class:`radical.pilot.ComputePilot`]: The pilot objects that will
          be added to the unit manager.

    **Raises:**
        * :class:`radical.pilot.PilotException`
    """
    if not self._uid:
        raise IncorrectState(msg="Invalid object instance.")

    if not isinstance(pilots, list):
        pilots = [pilots]

    known_ids = self.list_pilots()
    for duplicate in (p for p in pilots if p.uid in known_ids):
        logger.warning('adding the same pilot twice (%s)' % duplicate.uid)

    # hand the full list to the worker in one call
    self._worker.add_pilots(pilots)

    # let the scheduler know...
    for pilot in pilots:
        self._scheduler.pilot_added(pilot)

    # also keep the facade instances around
    self._pilots.extend(pilots)
def add_pilots(self, pilots):
    """Associate one or more pilots with the unit manager.

    Accepts a single :class:`radical.pilot.ComputePilot` or a list of
    them.  A warning is logged for pilots that were already registered;
    they are added again anyway.

    **Raises:**
        * :class:`radical.pilot.PilotException`
    """
    if not self._uid:
        raise IncorrectState(msg="Invalid object instance.")

    if not isinstance(pilots, list):
        pilots = [pilots]

    registered = self.list_pilots()
    duplicates = [p.uid for p in pilots if p.uid in registered]
    for dup_uid in duplicates:
        logger.warning('adding the same pilot twice (%s)' % dup_uid)

    # register with the worker process first ...
    self._worker.add_pilots(pilots)

    # ... then inform the scheduler ...
    for pilot in pilots:
        self._scheduler.pilot_added(pilot)

    # ... and finally keep the facade objects around
    for pilot in pilots:
        self._pilots.append(pilot)
def get_resource_config(self, resource_key, schema=None):
    """Return the configuration dict for the requested resource.

    A deprecated key is first translated to its alias.  The returned
    dictionary is a deep copy; when a schema is selected (explicitly or
    as the resource's first configured default), its entries are merged
    into the top level of that copy.
    """
    if resource_key in self._resource_aliases:
        new_key = self._resource_aliases[resource_key]
        logger.warning("using alias '%s' for deprecated resource key '%s'"
                       % (new_key, resource_key))
        resource_key = new_key

    if resource_key not in self._resource_configs:
        raise PilotException("Resource key '%s' is not known." % resource_key)

    # deep copy: the schema merge below must not alter the stored config
    config = copy.deepcopy(self._resource_configs[resource_key])

    if not schema and 'schemas' in config:
        # default to the first configured schema
        schema = config['schemas'][0]

    if schema:
        if schema not in config:
            raise RuntimeError("schema %s unknown for resource %s"
                               % (schema, resource_key))
        # merge schema-specific resource keys into the resource config
        for key, value in config[schema].items():
            config[key] = value

    return config
def close(self, cleanup=True, terminate=True, delete=None):
    """Close the session.

    All subsequent attempts to access objects attached to the session
    will result in an error.  If *cleanup* is True (default), the session
    data is removed from the database.

    **Arguments:**
        * **cleanup** (`bool`): Remove session from MongoDB
          (implies terminate)
        * **terminate** (`bool`): Shut down all pilots associated with
          the session.
        * **delete** (`bool`): Deprecated -- overrides both flags above.

    **Raises:**
        * :class:`radical.pilot.IncorrectState` if the session is closed
          or doesn't exist.
    """
    logger.debug("session %s closing" % self._uid)

    if not self._uid:
        logger.error("Session object already closed.")
        return

    # We keep 'delete' for backward compatibility.  If it was set, and the
    # other flags (cleanup, terminate) are as defaulted (True), then delete
    # supersedes them.  'delete' is deprecated, so warn whenever it is used.
    if delete is not None:                   # was: 'delete != None'
        if cleanup and terminate:            # was: '== True' comparisons
            cleanup = delete
            terminate = delete
        logger.warning("'delete' flag on session is deprecated. "
                       "Please use 'cleanup' and 'terminate' instead!")

    if cleanup:
        # cleanup implies terminate
        terminate = True

    for pmgr in self._pilot_manager_objects:
        logger.debug("session %s closes pmgr %s" % (self._uid, pmgr._uid))
        pmgr.close(terminate=terminate)
        logger.debug("session %s closed pmgr %s" % (self._uid, pmgr._uid))

    for umgr in self._unit_manager_objects:
        logger.debug("session %s closes umgr %s" % (self._uid, umgr._uid))
        umgr.close()
        logger.debug("session %s closed umgr %s" % (self._uid, umgr._uid))

    if cleanup:
        self._destroy_db_entry()

    logger.debug("session %s closed" % self._uid)
def call_unit_state_callbacks(self, unit_id, new_state):
    """Call all relevant callbacks for a unit state change, on unit level
    as well as on manager level.

    **Arguments:**
        * **unit_id**: id of the unit whose state changed
        * **new_state**: the state the unit changed into
    """
    # This is the earliest point at which the application could have been
    # notified about the state change, so record that event.
    self._callback_histories.setdefault(unit_id, list()).append(
        {'timestamp': datetime.datetime.utcnow(),
         'state': new_state})

    # hoist the repeated shared-data lookup
    facade = self._shared_data[unit_id]['facade_object']

    for cb, cb_data in self._shared_data[unit_id]['callbacks']:
        try:
            if facade:
                if cb_data:
                    cb(facade, new_state, cb_data)
                else:
                    cb(facade, new_state)
            else:
                logger.error("Couldn't call callback (no pilot instance)")
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # If we have any manager-level callbacks registered, call those as well.
    self._manager_callbacks.setdefault(UNIT_STATE, list())

    for cb, cb_data in self._manager_callbacks[UNIT_STATE]:
        if not facade:
            # fixed log typo: 'incomple' -> 'incomplete'
            logger.warning('skip cb for incomplete unit (%s: %s)'
                           % (unit_id, new_state))
            break
        try:
            if cb_data:
                cb(facade, new_state, cb_data)
            else:
                cb(facade, new_state)
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # On a final state, record the unit's callback history for later
    # evaluation.
    if new_state in (DONE, FAILED, CANCELED):
        self._db.publish_compute_unit_callback_history(
            unit_id, self._callback_histories[unit_id])
def close(self, cleanup=True, terminate=True, delete=None):
    """Close the session.

    All subsequent attempts to access objects attached to the session
    will result in an error.  If *cleanup* is set to True (default), the
    session data is removed from the database.

    **Arguments:**
        * **cleanup** (`bool`): Remove session from MongoDB
          (implies terminate)
        * **terminate** (`bool`): Shut down all pilots associated with
          the session.
        * **delete** (`bool`): Deprecated -- overrides both flags above.

    **Raises:**
        * :class:`radical.pilot.IncorrectState` if the session is closed
          or doesn't exist.
    """
    logger.debug("session %s closing" % self._uid)

    if not self._uid:
        logger.error("Session object already closed.")
        return

    # 'delete' is kept for backward compatibility only.  If it is set and
    # 'cleanup'/'terminate' are at their defaults (True), 'delete'
    # supersedes them.  Warn whenever the deprecated flag is used.
    if delete is not None:                   # was: 'delete != None'
        if cleanup and terminate:            # was: '== True' comparisons
            cleanup = delete
            terminate = delete
        logger.warning("'delete' flag on session is deprecated. "
                       "Please use 'cleanup' and 'terminate' instead!")

    if cleanup:
        # cleanup implies terminate
        terminate = True

    for pmgr in self._pilot_manager_objects:
        logger.debug("session %s closes pmgr %s" % (self._uid, pmgr._uid))
        pmgr.close(terminate=terminate)
        logger.debug("session %s closed pmgr %s" % (self._uid, pmgr._uid))

    for umgr in self._unit_manager_objects:
        logger.debug("session %s closes umgr %s" % (self._uid, umgr._uid))
        umgr.close()
        logger.debug("session %s closed umgr %s" % (self._uid, umgr._uid))

    if cleanup:
        self._destroy_db_entry()

    logger.debug("session %s closed" % self._uid)
def call_unit_state_callbacks(self, unit_id, new_state):
    """Wrapper to call all relevant callbacks on a unit state change, on
    unit level as well as on manager level.

    **Arguments:**
        * **unit_id**: id of the unit whose state changed
        * **new_state**: the state the unit changed into
    """
    # Earliest point at which the application could have been notified
    # about the state change -- record that event.
    self._callback_histories.setdefault(unit_id, list()).append(
        {'timestamp': datetime.datetime.utcnow(),
         'state': new_state})

    # hoist the repeated shared-data lookup into a local
    facade = self._shared_data[unit_id]['facade_object']

    for cb, cb_data in self._shared_data[unit_id]['callbacks']:
        try:
            if facade:
                if cb_data:
                    cb(facade, new_state, cb_data)
                else:
                    cb(facade, new_state)
            else:
                logger.error("Couldn't call callback (no pilot instance)")
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # Call manager-level callbacks as well, if any are registered.
    self._manager_callbacks.setdefault(UNIT_STATE, list())

    for cb, cb_data in self._manager_callbacks[UNIT_STATE]:
        if not facade:
            # fixed log typo: 'incomple' -> 'incomplete'
            logger.warning('skip cb for incomplete unit (%s: %s)'
                           % (unit_id, new_state))
            break
        try:
            if cb_data:
                cb(facade, new_state, cb_data)
            else:
                cb(facade, new_state)
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # For final states, record the callback history for later evaluation.
    if new_state in (DONE, FAILED, CANCELED):
        self._db.publish_compute_unit_callback_history(
            unit_id, self._callback_histories[unit_id])
def close(self):
    """Shut down the UnitManager and its background workers in a
    coordinated fashion.

    Calling close() on an already-closed manager only logs a warning.
    """
    uid = self._uid
    if not uid:
        logger.warning("UnitManager object already closed.")
        return

    if self._worker is not None:
        self._worker.stop()
        # remove the worker from the session's process registry
        self._session._process_registry.remove(uid)

    logger.info("Closed UnitManager %s." % str(uid))

    # invalidate this instance
    self._uid = None
def submit_pilots(self, pilot_descriptions):
    """Submit one or more new :class:`radical.pilot.ComputePilot` to a
    resource.

    **Returns:**
        * One or more :class:`radical.pilot.ComputePilot` instances
          [`list of :class:`radical.pilot.ComputePilot`].

    **Raises:**
        * :class:`radical.pilot.PilotException`
    """
    import copy  # hoisted out of the per-description loop

    # Check if the object instance is still valid.
    self._assert_obj_is_valid()

    # Implicit list conversion: remember whether the caller passed a
    # single description so the return type can match.
    return_list_type = True
    if not isinstance(pilot_descriptions, list):
        return_list_type = False
        pilot_descriptions = [pilot_descriptions]

    # Iterate over the pilot descriptions, try to create a pilot for
    # each one and append it to 'pilot_obj_list'.
    pilot_obj_list = list()

    for pilot_description in pilot_descriptions:

        # generic mandatory attributes (collapsed from an if/elif chain)
        for attr in ('resource', 'runtime', 'cores'):
            if getattr(pilot_description, attr) is None:
                raise BadParameter("ComputePilotDescription does not define "
                                   "mandatory attribute '%s'." % attr)

        resource_key = pilot_description.resource
        resource_cfg = self._session.get_resource_config(resource_key)

        # check resource-specific mandatory attributes
        if "mandatory_args" in resource_cfg:
            for ma in resource_cfg["mandatory_args"]:
                if getattr(pilot_description, ma) is None:
                    raise BadParameter(
                        "ComputePilotDescription does not define attribute "
                        "'{0}' which is required for '{1}'.".format(
                            ma, resource_key))

        # we expand and exchange keys in the resource config, depending on
        # the selected schema, so better use a deep copy...
        resource_cfg = copy.deepcopy(resource_cfg)

        schema = pilot_description['access_schema']
        if not schema and 'schemas' in resource_cfg:
            # default to the resource's first configured schema
            schema = resource_cfg['schemas'][0]

        if schema not in resource_cfg:
            logger.warning("schema %s unknown for resource %s -- continue with defaults"
                           % (schema, resource_key))
        else:
            # merge schema-specific resource keys into the resource config
            for key in resource_cfg[schema]:
                resource_cfg[key] = resource_cfg[schema][key]

        # If a sandbox is requested, it must be rooted in one of the
        # resource's valid filesystem roots.
        if pilot_description.sandbox is not None:
            valid_roots = resource_cfg.get("valid_roots")
            if valid_roots is not None:
                if not any(pilot_description.sandbox.startswith(vr)
                           for vr in valid_roots):
                    raise BadParameter(
                        "Working directory for resource '%s' defined as '%s' but needs to be rooted in %s "
                        % (resource_key, pilot_description.sandbox,
                           valid_roots))

        # After the sanity checks have passed, register a pilot startup
        # request with the worker process and create a facade object.
        pilot = ComputePilot.create(pilot_description=pilot_description,
                                    pilot_manager_obj=self)

        pilot_uid = self._worker.register_start_pilot_request(
            pilot=pilot, resource_config=resource_cfg)

        pilot._uid = pilot_uid
        pilot_obj_list.append(pilot)

    # Implicit return value conversion
    if return_list_type:
        return pilot_obj_list
    return pilot_obj_list[0]
def submit_pilots(self, pilot_descriptions):
    """Submit one or more new :class:`radical.pilot.ComputePilot` to a
    resource.

    **Returns:**
        * One or more :class:`radical.pilot.ComputePilot` instances
          [`list of :class:`radical.pilot.ComputePilot`].

    **Raises:**
        * :class:`radical.pilot.PilotException`
    """
    import copy  # hoisted out of the per-description loop

    # Check if the object instance is still valid.
    self._assert_obj_is_valid()

    # Implicit list conversion: remember whether a single description was
    # passed so the return type can match.
    return_list_type = True
    if not isinstance(pilot_descriptions, list):
        return_list_type = False
        pilot_descriptions = [pilot_descriptions]

    # Iterate over the pilot descriptions, try to create a pilot for each
    # one and append it to 'pilot_obj_list'.
    pilot_obj_list = list()

    for pilot_description in pilot_descriptions:

        # generic mandatory attributes (collapsed from an if/elif chain)
        for attr in ('resource', 'runtime', 'cores'):
            if getattr(pilot_description, attr) is None:
                raise BadParameter("ComputePilotDescription does not define "
                                   "mandatory attribute '%s'." % attr)

        resource_key = pilot_description.resource
        resource_cfg = self._session.get_resource_config(resource_key)

        # check resource-specific mandatory attributes
        if "mandatory_args" in resource_cfg:
            for ma in resource_cfg["mandatory_args"]:
                if getattr(pilot_description, ma) is None:
                    raise BadParameter(
                        "ComputePilotDescription does not define attribute "
                        "'{0}' which is required for '{1}'.".format(
                            ma, resource_key))

        # we expand and exchange keys in the resource config, depending on
        # the selected schema, so better use a deep copy...
        resource_cfg = copy.deepcopy(resource_cfg)

        schema = pilot_description['access_schema']
        if not schema and 'schemas' in resource_cfg:
            # default to the resource's first configured schema
            schema = resource_cfg['schemas'][0]

        if schema not in resource_cfg:
            logger.warning("schema %s unknown for resource %s -- continue with defaults"
                           % (schema, resource_key))
        else:
            # merge schema-specific resource keys into the resource config
            for key in resource_cfg[schema]:
                resource_cfg[key] = resource_cfg[schema][key]

        # If a sandbox is requested, it must be rooted in one of the
        # resource's valid filesystem roots.
        if pilot_description.sandbox is not None:
            valid_roots = resource_cfg.get("valid_roots")
            if valid_roots is not None:
                if not any(pilot_description.sandbox.startswith(vr)
                           for vr in valid_roots):
                    raise BadParameter(
                        "Working directory for resource '%s' defined as '%s' but needs to be rooted in %s "
                        % (resource_key, pilot_description.sandbox,
                           valid_roots))

        # After the sanity checks have passed, register a pilot startup
        # request with the worker process and create a facade object.
        pilot = ComputePilot.create(pilot_description=pilot_description,
                                    pilot_manager_obj=self)

        pilot_uid = self._worker.register_start_pilot_request(
            pilot=pilot, resource_config=resource_cfg)

        pilot._uid = pilot_uid
        pilot_obj_list.append(pilot)

    # Implicit return value conversion
    if return_list_type:
        return pilot_obj_list
    return pilot_obj_list[0]
def _unit_state_callback(self, unit, state):
    """Unit state change callback for the backfilling scheduler.

    Units which fall back to NEW/UNSCHEDULED are moved from their pilot's
    run queue back to the wait queue and rescheduled.  Units which reach
    output staging or a final state free capacity ('caps') on their pilot,
    which triggers a reschedule targeted at that pilot.

    Errors are logged and swallowed on purpose -- a scheduler callback
    must never propagate exceptions into the notification layer.
    """
    try:
        with self.lock:

            uid = unit.uid
            logger.info("[SchedulerCallback]: Computeunit %s changed to %s"
                        % (uid, state))

            found_unit = False

            if state in [NEW, UNSCHEDULED]:

                for pid in self.runqs:

                    if not pid:
                        logger.warning('cannot handle final unit %s w/o pilot information' % uid)

                    if uid in self.runqs[pid]:
                        logger.info('reschedule NEW unit %s from %s' % (uid, pid))
                        unit = self.runqs[pid][uid]
                        found_unit = True
                        del self.runqs[pid][uid]
                        self.waitq[uid] = unit
                        self._reschedule(uid=uid)
                        return

            if state in [PENDING_OUTPUT_STAGING, STAGING_OUTPUT,
                         DONE, FAILED, CANCELED]:

                # the pilot which owned this CU should now have free slots
                # available
                # FIXME: how do I get the pilot from the CU?
                pid = unit.execution_details.get('pilot', None)

                if not pid:
                    raise RuntimeError('cannot handle final unit %s w/o pilot information' % uid)

                if pid not in self.pilots:
                    logger.warning('cannot handle unit %s cb for pilot %s (pilot is gone)' % (uid, pid))
                else:
                    if uid in self.runqs[pid]:
                        unit = self.runqs[pid][uid]
                        del self.runqs[pid][uid]
                        self.pilots[pid]['caps'] += unit.description.cores
                        self._reschedule(target_pid=pid)
                        found_unit = True

                    # NOTE(review): this warning is kept inside the
                    # known-pilot branch -- when the pilot is gone,
                    # self.pilots[pid] does not exist and indexing it
                    # would raise KeyError.  Also: deprecated
                    # 'logger.warn' replaced by 'logger.warning'.
                    if not found_unit:
                        logger.warning('unit %s freed %s cores on %s (== %s) -- not reused'
                                       % (uid, unit.description.cores, pid,
                                          self.pilots[pid]['caps']))

    except Exception as e:
        # deliberate best-effort: log and swallow
        logger.error("error in unit callback for backfiller (%s) - ignored" % e)
def _reschedule(self, target_pid=None, uid=None):
    """Try to place waiting units onto pilots with free capacity.

    If *uid* is given, only that unit (which must be in the wait queue)
    is considered; otherwise the whole wait queue is examined.  If
    *target_pid* is given, it must name a known pilot.  The resulting
    placement is pushed to the unit manager via handle_schedule().
    """
    with self.lock:

        # dig through the list of waiting CUs, and try to find a pilot for each
        # of them.  This enacts first-come-first-served, but will be unbalanced
        # if the units in the queue are of different sizes.  That problem is
        # ignored at this point.
        #
        # if any units get scheduled, we push a dictionary to the UM to enact
        # the schedule:
        #   {
        #     unit_1: [pilot_id_1, pilot_resource_name]
        #     unit_2: [pilot_id_2, pilot_resource_name]
        #     unit_4: [pilot_id_2, pilot_resource_name]
        #     ...
        #   }

        if not len(self.pilots.keys()):
            # no pilots to work on, yet.
            logger.warning("cannot schedule -- no pilots available")
            return

        if target_pid and target_pid not in self.pilots:
            logger.warning("cannot schedule -- invalid target pilot %s" % target_pid)
            raise RuntimeError("Invalid pilot (%s)" % target_pid)

        schedule = dict()
        schedule['units'] = dict()
        schedule['pilots'] = self.pilots

        logger.debug("schedule (%s units waiting)" % len(self.waitq))

        # select the set of units to (re)schedule
        units_to_schedule = list()
        if uid:
            if uid not in self.waitq:
                # self._dump ()
                logger.warning("cannot schedule -- unknown unit %s" % uid)
                raise RuntimeError("Invalid unit (%s)" % uid)

            units_to_schedule.append(self.waitq[uid])

        else:
            # just copy the whole waitq
            for uid in self.waitq:
                units_to_schedule.append(self.waitq[uid])

        for unit in units_to_schedule:

            uid = unit.uid
            ud = unit.description

            # sanity check on unit state
            if unit.state not in [NEW, SCHEDULING, UNSCHEDULED]:
                raise RuntimeError("scheduler queue should only contain NEW or UNSCHEDULED units (%s)" % uid)

            # logger.debug ("examine unit %s (%s cores)" % (uid, ud.cores))

            for pid in self.pilots:

                # only ACTIVE pilots with sufficient free capacity ('caps')
                # are eligible targets
                # logger.debug ("  pilot %s (%s caps, state %s)" \
                #            % (pid, self.pilots[pid]['state'], self.pilots[pid]['caps']))

                if self.pilots[pid]['state'] in [ACTIVE]:

                    if ud.cores <= self.pilots[pid]['caps']:

                        # logger.debug ("    unit  %s fits on pilot %s" % (uid, pid))

                        self.pilots[pid]['caps'] -= ud.cores
                        schedule['units'][unit] = pid

                        # scheduled units are removed from the waitq
                        del self.waitq[uid]
                        self.runqs[pid][uid] = unit
                        break

                # unit was not scheduled...
                schedule['units'][unit] = None

            # print a warning if a unit cannot possibly be scheduled, ever
            # (no pilot has enough total cores, independent of current load)
            can_handle_unit = False
            for pid in self.pilots:
                if unit.description.cores <= self.pilots[pid]['cores']:
                    can_handle_unit = True
                    break

            if not can_handle_unit:
                logger.warning('cannot handle unit %s with current set of pilots' % uid)

        # pprint.pprint (schedule)

        # tell the UM about the schedule
        self.manager.handle_schedule(schedule)
def check_pilot_states(self, pilot_col):
    """Periodical health check for all pilots of this manager which are
    pending or active.

    For each such pilot the backing SAGA job is (re)connected and its
    state inspected; pilots whose job failed, finished, or could not be
    reached JOB_CHECK_MAX_MISSES times in a row are marked FAILED/DONE
    in the database collection *pilot_col*.
    """
    pending_pilots = pilot_col.find({
        "pilotmanager": self.pilot_manager_id,
        "state": {"$in": [PENDING_ACTIVE, ACTIVE]}
    })

    for pending_pilot in pending_pilots:

        pilot_failed = False
        pilot_done = False
        reconnected = False
        pilot_id = pending_pilot["_id"]
        log_message = ""
        saga_job_id = pending_pilot["saga_job_id"]

        logger.info("Performing periodical health check for %s (SAGA job id %s)"
                    % (str(pilot_id), saga_job_id))

        # start counting reconnect misses for pilots we see first
        self.missing_pilots.setdefault(pilot_id, 0)

        # Create (or reuse) a job service object and inspect the job:
        try:
            js_url = saga_job_id.split("]-[")[0][1:]

            if js_url in self._shared_worker_data['job_services']:
                js = self._shared_worker_data['job_services'][js_url]
            else:
                js = saga.job.Service(js_url, session=self._session)
                self._shared_worker_data['job_services'][js_url] = js

            saga_job = js.get_job(saga_job_id)
            reconnected = True

            if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                pilot_failed = True
                log_message = "SAGA job state for ComputePilot %s is %s."\
                              % (pilot_id, saga_job.state)

            if saga_job.state in [saga.job.DONE]:
                pilot_done = True
                log_message = "SAGA job state for ComputePilot %s is %s."\
                              % (pilot_id, saga_job.state)

        except Exception as e:

            if not reconnected:
                logger.warning('could not reconnect to pilot for state check (%s)' % e)
                self.missing_pilots[pilot_id] += 1

                if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                    # was a hard-coded '10' -- report the actual threshold
                    logger.debug('giving up after %s attempts' % JOB_CHECK_MAX_MISSES)
                    pilot_failed = True
                    log_message = "Could not reconnect to pilot %s "\
                                  "multiple times - giving up" % pilot_id
            else:
                logger.warning('pilot state check failed: %s' % e)
                pilot_failed = True
                log_message = "Couldn't determine job state for ComputePilot %s. " \
                              "Assuming it has failed." % pilot_id

        if pilot_failed or pilot_done:
            # FIXME: the DONE update should only be done if the state is
            # not yet done...
            final_state = FAILED if pilot_failed else DONE

            out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
            ts = datetime.datetime.utcnow()

            pilot_col.update(
                {"_id": pilot_id, "state": {"$ne": DONE}},
                {"$set": {"state": final_state,
                          "stdout": out,
                          "stderr": err,
                          "logfile": log},
                 "$push": {"statehistory": {"state": final_state,
                                            "timestamp": ts},
                           "log": {"message": log_message,
                                   "timestamp": ts}}})

            logger.debug(log_message)

            if pilot_failed:
                # 'logger.warn' is deprecated -> 'logger.warning'
                logger.warning('pilot %s declared dead' % pilot_id)
            else:
                # was 'declared dead' -- wrong message for a DONE pilot
                logger.warning('pilot %s declared done' % pilot_id)

        else:
            if self.missing_pilots[pilot_id]:
                logger.info('pilot %s *assumed* alive and well (%s)'
                            % (pilot_id, self.missing_pilots[pilot_id]))
            else:
                logger.info('pilot %s seems alive and well' % (pilot_id))
def check_pilot_states(self, pilot_col):
    """Periodical health check for all pending/active pilots of this
    manager.

    Reconnects to each pilot's SAGA job and inspects its state; pilots
    whose job failed, finished, or stayed unreachable for
    JOB_CHECK_MAX_MISSES consecutive checks are marked FAILED/DONE in the
    database collection *pilot_col*.
    """
    pending_pilots = pilot_col.find(
        {"pilotmanager": self.pilot_manager_id,
         "state": {"$in": [PENDING_ACTIVE, ACTIVE]}}
    )

    for pending_pilot in pending_pilots:

        pilot_failed = False
        pilot_done = False
        reconnected = False
        pilot_id = pending_pilot["_id"]
        log_message = ""
        saga_job_id = pending_pilot["saga_job_id"]

        logger.info("Performing periodical health check for %s (SAGA job id %s)"
                    % (str(pilot_id), saga_job_id))

        # start counting reconnect misses for pilots we see first
        self.missing_pilots.setdefault(pilot_id, 0)

        # Create (or reuse) a job service object and inspect the job:
        try:
            js_url = saga_job_id.split("]-[")[0][1:]

            if js_url in self._shared_worker_data["job_services"]:
                js = self._shared_worker_data["job_services"][js_url]
            else:
                js = saga.job.Service(js_url, session=self._session)
                self._shared_worker_data["job_services"][js_url] = js

            saga_job = js.get_job(saga_job_id)
            reconnected = True

            if saga_job.state in [saga.job.FAILED, saga.job.CANCELED]:
                pilot_failed = True
                log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

            if saga_job.state in [saga.job.DONE]:
                pilot_done = True
                log_message = "SAGA job state for ComputePilot %s is %s." % (pilot_id, saga_job.state)

        except Exception as e:

            if not reconnected:
                logger.warning("could not reconnect to pilot for state check (%s)" % e)
                self.missing_pilots[pilot_id] += 1

                if self.missing_pilots[pilot_id] >= JOB_CHECK_MAX_MISSES:
                    # was a hard-coded '10' -- report the actual threshold
                    logger.debug("giving up after %s attempts" % JOB_CHECK_MAX_MISSES)
                    pilot_failed = True
                    log_message = "Could not reconnect to pilot %s " "multiple times - giving up" % pilot_id
            else:
                logger.warning("pilot state check failed: %s" % e)
                pilot_failed = True
                log_message = (
                    "Couldn't determine job state for ComputePilot %s. "
                    "Assuming it has failed." % pilot_id
                )

        if pilot_failed or pilot_done:
            # FIXME: the DONE update should only be done if the state is
            # not yet done...
            final_state = FAILED if pilot_failed else DONE

            out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
            ts = datetime.datetime.utcnow()

            pilot_col.update(
                {"_id": pilot_id, "state": {"$ne": DONE}},
                {
                    "$set": {"state": final_state, "stdout": out, "stderr": err, "logfile": log},
                    "$push": {
                        "statehistory": {"state": final_state, "timestamp": ts},
                        "log": {"message": log_message, "timestamp": ts},
                    },
                },
            )

            logger.debug(log_message)

            if pilot_failed:
                # 'logger.warn' is deprecated -> 'logger.warning'
                logger.warning("pilot %s declared dead" % pilot_id)
            else:
                # was 'declared dead' -- wrong message for a DONE pilot
                logger.warning("pilot %s declared done" % pilot_id)

        else:
            if self.missing_pilots[pilot_id]:
                logger.info("pilot %s *assumed* alive and well (%s)"
                            % (pilot_id, self.missing_pilots[pilot_id]))
            else:
                logger.info("pilot %s seems alive and well" % (pilot_id))