def get_compute_pilot_data(self, pilot_ids=None):
    """Returns the raw data (JSON dicts) of one or more ComputePilots
       registered with this Worker / PilotManager.
    """
    # Wait for the initialized event to assert proper operation.
    self._initialized.wait()

    try:
        if pilot_ids is None:
            pilot_ids = self._shared_data.keys()

        return_list_type = True
        if not isinstance(pilot_ids, list):
            return_list_type = False
            pilot_ids = [pilot_ids]

        data = list()
        for pilot_id in pilot_ids:
            data.append(self._shared_data[pilot_id]['data'])

        if return_list_type:
            return data
        else:
            return data[0]

    except KeyError as e:
        logger.exception("Unknown Pilot ID %s: %s" % (pilot_id, e))
        raise
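# Usage sketch for get_compute_pilot_data() -- a hypothetical illustration,
# assuming `worker` is an initialized controller instance; the pilot ID
# 'pilot.0001' is made up:
#
#   data     = worker.get_compute_pilot_data('pilot.0001')  # single dict
#   all_data = worker.get_compute_pilot_data()               # list of dicts
#   state    = data['state']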
def call_unit_state_callbacks(self, unit_id, new_state):
    """Wrapper function to call all relevant callbacks, on unit-level
       as well as manager-level.
    """
    # This is the point where, at the earliest, the application could have
    # been notified about unit state changes.  So we record that event.
    if unit_id not in self._callback_histories:
        self._callback_histories[unit_id] = list()
    self._callback_histories[unit_id].append(
        {'timestamp': datetime.datetime.utcnow(),
         'state'    : new_state})

    for [cb, cb_data] in self._shared_data[unit_id]['callbacks']:
        try:
            if self._shared_data[unit_id]['facade_object']:
                if cb_data:
                    cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
                else:
                    cb(self._shared_data[unit_id]['facade_object'], new_state)
            else:
                logger.error("Couldn't call callback (no pilot instance)")
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # If we have any manager-level callbacks registered, we
    # call those as well!
    if UNIT_STATE not in self._manager_callbacks:
        self._manager_callbacks[UNIT_STATE] = list()

    for [cb, cb_data] in self._manager_callbacks[UNIT_STATE]:
        if not self._shared_data[unit_id]['facade_object']:
            logger.warning('skip cb for incomplete unit (%s: %s)' % (unit_id, new_state))
            break

        try:
            if cb_data:
                cb(self._shared_data[unit_id]['facade_object'], new_state, cb_data)
            else:
                cb(self._shared_data[unit_id]['facade_object'], new_state)
        except Exception as e:
            logger.exception("Couldn't call callback function %s" % e)
            raise

    # If we meet a final state, we record the object's callback history for
    # later evaluation.
    if new_state in (DONE, FAILED, CANCELED):
        self._db.publish_compute_unit_callback_history(
            unit_id, self._callback_histories[unit_id])
def unschedule_compute_units(self, units):
    """Set the unit state to UNSCHEDULED.
    """
    try:
        unit_ids = [unit.uid for unit in units]
        self._db.set_compute_unit_state(unit_ids, UNSCHEDULED,
                                        "unit remains unscheduled")
    except Exception as e:
        logger.exception('error in unit manager controller (unschedule())')
        raise
def fire_manager_callback(self, metric, obj, value):
    """Fire a manager-level callback.
    """
    if metric not in self._manager_callbacks:
        self._manager_callbacks[metric] = list()

    for [cb, cb_data] in self._manager_callbacks[metric]:
        try:
            if cb_data:
                cb(obj, value, cb_data)
            else:
                cb(obj, value)
        except Exception as e:
            logger.exception("Couldn't call '%s' callback function %s: %s"
                             % (metric, cb, e))
            raise
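# Callback signature sketch for fire_manager_callback(): callbacks registered
# in self._manager_callbacks are invoked as cb(obj, value) or, if registered
# with extra data, cb(obj, value, cb_data).  A minimal example (names are
# illustrative, not part of the API):
#
#   def my_state_cb(obj, state, cb_data=None):
#       print "object %s is now in state %s" % (obj, state)
#
#   # after registration under some metric, a state change would trigger:
#   # self.fire_manager_callback(UNIT_STATE, unit, new_state)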
def submit_units(self, unit_descriptions):
    """Submits one or more :class:`radical.pilot.ComputeUnit` instances to
       the unit manager.

       **Arguments:**
           * **unit_descriptions** [:class:`radical.pilot.ComputeUnitDescription`
             or list of :class:`radical.pilot.ComputeUnitDescription`]: The
             description of the compute unit instance(s) to create.

       **Returns:**
           * A list of :class:`radical.pilot.ComputeUnit` objects.

       **Raises:**
           * :class:`radical.pilot.PilotException`
    """
    if not self._uid:
        raise IncorrectState(msg="Invalid object instance.")

    return_list_type = True
    if not isinstance(unit_descriptions, list):
        return_list_type  = False
        unit_descriptions = [unit_descriptions]

    # we return a list of compute units
    #
    # the scheduler will return a dictionary of the form:
    #   {
    #     ud_1: pilot_id_a,
    #     ud_2: pilot_id_b,
    #     ...
    #   }
    #
    # The scheduler may not be able to schedule some units -- those will
    # have 'None' as pilot ID.

    units = list()
    for ud in unit_descriptions:
        units.append(ComputeUnit.create(unit_description=ud,
                                        unit_manager_obj=self,
                                        local_state=SCHEDULING))

    self._worker.publish_compute_units(units=units)

    schedule = None
    try:
        schedule = self._scheduler.schedule(units=units)
    except Exception as e:
        logger.exception("Internal error - unit scheduler failed")
        raise

    self.handle_schedule(schedule)

    if return_list_type:
        return units
    else:
        return units[0]
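# Application-level usage sketch for submit_units() -- a hedged illustration,
# assuming `umgr` is a connected UnitManager; the executable and arguments
# are made-up examples:
#
#   import radical.pilot as rp
#
#   cud = rp.ComputeUnitDescription()
#   cud.executable = "/bin/echo"
#   cud.arguments  = ["hello"]
#
#   unit  = umgr.submit_units(cud)    # single description -> single unit
#   units = umgr.submit_units([cud])  # list -> list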
def __init__(self, database_url=None, database_name="radicalpilot",
             uid=None, name=None):
    """Creates a new or reconnects to an existing session.

       If called without a uid, a new Session instance is created and
       stored in the database.  If uid is set, an existing session is
       retrieved from the database.

       **Arguments:**
           * **database_url** (`string`): The MongoDB URL.  If none is
             given, RP uses the environment variable RADICAL_PILOT_DBURL.
             If that is not set, an error will be raised.

           * **database_name** (`string`): An alternative database name
             (default: 'radicalpilot').

           * **uid** (`string`): If uid is set, we try to reconnect to an
             existing session instead of creating a new one.

           * **name** (`string`): An optional human-readable name.

       **Returns:**
           * A new Session instance.

       **Raises:**
           * :class:`radical.pilot.DatabaseError`
    """
    # init the base class inits
    saga.Session.__init__(self)
    Object.__init__(self)

    # before doing anything else, set up the debug helper for the lifetime
    # of the session.
    self._debug_helper = ru.DebugHelper()

    # Dictionaries holding all manager objects created during the session.
    self._pilot_manager_objects = list()
    self._unit_manager_objects  = list()

    # Create a new process registry.  All objects belonging to this
    # session will register their worker processes (if they have any)
    # in this registry.  This makes it easier to shut down things in
    # a more coordinated fashion.
    self._process_registry = _ProcessRegistry()

    # The resource configuration dictionary associated with the session.
    self._resource_configs = {}

    self._database_url  = database_url
    self._database_name = database_name

    if not self._database_url:
        self._database_url = os.getenv("RADICAL_PILOT_DBURL", None)

    if not self._database_url:
        raise PilotException("no database URL (set RADICAL_PILOT_DBURL)")

    logger.info("using database url %s" % self._database_url)

    # if the database url contains a path element, we interpret that as
    # database name (without the leading slash)
    tmp_url = ru.Url(self._database_url)
    if tmp_url.path           and \
       tmp_url.path[0] == '/' and \
       len(tmp_url.path) > 1:
        self._database_name = tmp_url.path[1:]
        logger.info("using database path %s" % self._database_name)
    else:
        logger.info("using database name %s" % self._database_name)

    # Loading all "default" resource configurations
    module_path  = os.path.dirname(os.path.abspath(__file__))
    default_cfgs = "%s/configs/*.json" % module_path
    config_files = glob.glob(default_cfgs)

    for config_file in config_files:

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            logger.error("skip config file %s: %s" % (config_file, e))
            continue

        for rc in rcs:
            logger.info("Loaded resource configurations for %s" % rc)
            self._resource_configs[rc] = rcs[rc].as_dict()

    user_cfgs    = "%s/.radical/pilot/configs/*.json" % os.environ.get('HOME')
    config_files = glob.glob(user_cfgs)

    for config_file in config_files:

        try:
            rcs = ResourceConfig.from_file(config_file)
        except Exception as e:
            logger.error("skip config file %s: %s" % (config_file, e))
            continue

        for rc in rcs:
            logger.info("Loaded resource configurations for %s" % rc)

            if rc in self._resource_configs:
                # config exists -- merge user config into it
                ru.dict_merge(self._resource_configs[rc],
                              rcs[rc].as_dict(),
                              policy='overwrite')
            else:
                # new config -- add as is
                self._resource_configs[rc] = rcs[rc].as_dict()

    default_aliases = "%s/configs/aliases.json" % module_path
    self._resource_aliases = ru.read_json_str(default_aliases)['aliases']

    ##########################
    ## CREATE A NEW SESSION ##
    ##########################
    if uid is None:
        try:
            self._connected = None

            if name:
                self._name = name
                self._uid  = name
              # self._uid  = ru.generate_id('rp.session.'+name+'.%(item_counter)06d',
              #                             mode=ru.ID_CUSTOM)
            else:
                self._uid  = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
                self._name = self._uid

            self._dbs, self._created, self._connection_info = \
                dbSession.new(sid     = self._uid,
                              name    = self._name,
                              db_url  = self._database_url,
                              db_name = database_name)

            logger.info("New Session created%s." % str(self))

        except Exception as ex:
            logger.exception('session create failed')
            raise PilotException("Couldn't create new session (database URL "
                                 "'%s' incorrect?): %s"
                                 % (self._database_url, ex))
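# Session construction sketch -- assumes a reachable MongoDB instance; the
# URL below is a placeholder, not a real service:
#
#   import radical.pilot as rp
#
#   session = rp.Session(database_url="mongodb://localhost:27017/",
#                        database_name="radicalpilot")
#   # equivalently, set RADICAL_PILOT_DBURL and call rp.Session()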
def _pilot_state_callback(self, pilot, state):

    try:
        with self.lock:

            pid = pilot.uid

            if pid not in self.pilots:
                # as we cannot unregister callbacks, we simply ignore this
                # invocation.  It's probably from a pilot we used previously.
                logger.warn("[SchedulerCallback]: ComputePilot %s changed to %s (ignored)"
                            % (pid, state))
                return

            self.pilots[pid]['state'] = state
            logger.debug("[SchedulerCallback]: ComputePilot %s changed to %s"
                         % (pid, state))

            if state in [ACTIVE]:
                # the pilot is now ready to be used
                self._reschedule(target_pid=pid)

            if state in [DONE, FAILED, CANCELED]:

              # self._dump('pilot is final')

                # If the pilot state is 'DONE', 'FAILED' or 'CANCELED', we
                # need to reschedule the units which are reschedulable --
                # all others are marked 'FAILED' if they are already
                # 'EXECUTING' and not restartable
                timestamp = datetime.datetime.utcnow()
                self._db.change_compute_units(
                    filter_dict={"pilot": pid,
                                 "state": {"$in": [UNSCHEDULED,
                                                   PENDING_INPUT_STAGING,
                                                   STAGING_INPUT,
                                                   PENDING_EXECUTION,
                                                   SCHEDULING]}},
                    set_dict   ={"state": UNSCHEDULED,
                                 "pilot": None},
                    push_dict  ={"statehistory": {"state"    : UNSCHEDULED,
                                                  "timestamp": timestamp},
                                 "log"         : {"message"  : "reschedule unit",
                                                  "timestamp": timestamp}})

                self._db.change_compute_units(
                    filter_dict={"pilot"      : pid,
                                 "restartable": True,
                                 "state"      : {"$in": [EXECUTING,
                                                         PENDING_OUTPUT_STAGING,
                                                         STAGING_OUTPUT]}},
                    set_dict   ={"state": UNSCHEDULED,
                                 "pilot": None},
                    push_dict  ={"statehistory": {"state"    : UNSCHEDULED,
                                                  "timestamp": timestamp},
                                 "log"         : {"message"  : "reschedule unit",
                                                  "timestamp": timestamp}})

                self._db.change_compute_units(
                    filter_dict={"pilot"      : pid,
                                 "restartable": False,
                                 "state"      : {"$in": [EXECUTING,
                                                         PENDING_OUTPUT_STAGING,
                                                         STAGING_OUTPUT]}},
                    set_dict   ={"state": FAILED},
                    push_dict  ={"statehistory": {"state"    : FAILED,
                                                  "timestamp": timestamp},
                                 "log"         : {"message"  : "reschedule unit",
                                                  "timestamp": timestamp}})

                # make sure that restartable units got back into the
                # wait queue
                #
                # FIXME AM: state management: I don't have the unit state!
                # The new state was just pushed to the DB, but I have
                # actually no idea for which units, and the state known to
                # the worker (i.e. the cached state) is most likely
                # outdated.
                #
                # So we don't handle runq/waitq here.  Instead, we rely on
                # the unit cb to get invoked as soon as the state propagated
                # back to us, and then remove them from the runq.  This is
                # slow, potentially very slow, but safe.

                # we can't use this pilot anymore...
                del self.pilots[pid]
                # FIXME: how can I *un*register a pilot callback?

    except Exception as e:
      # import traceback
      # traceback.print_exc()
        logger.exception("error in pilot callback for backfiller (%s) - ignored" % e)
        raise
def schedule_compute_units(self, pilot_uid, units):
    """Request the scheduling of one or more ComputeUnits on a
       ComputePilot.
    """
    try:
        cu_transfer   = list()
        cu_notransfer = list()

        # Get some information about the pilot sandbox from the database.
        pilot_info = self._db.get_pilots(pilot_ids=pilot_uid)
        # TODO: this hack below relies on what?!  That there is just one pilot?
        pilot_sandbox = pilot_info[0]['sandbox']

        # Split units into two different lists: the first list contains the
        # CUs that need file transfer and the second list contains the CUs
        # that don't.  The latter is added to the pilot directly, while the
        # former is added to the transfer queue.
        for unit in units:

            # Create objects for staging status tracking
            unit.FTW_Input_Status        = None
            unit.FTW_Input_Directives    = []
            unit.Agent_Input_Status      = None
            unit.Agent_Input_Directives  = []
            unit.FTW_Output_Status       = None
            unit.FTW_Output_Directives   = []
            unit.Agent_Output_Status     = None
            unit.Agent_Output_Directives = []

            # Split the input staging directives over the transfer worker
            # and the agent
            input_sds = unit.description.input_staging
            if not isinstance(input_sds, list):
                # Ugly, but a workaround for iterating on the attribute
                # interface.
                # TODO: Verify if this piece of code is actually still required
                if input_sds:
                    input_sds = [input_sds]
                else:
                    input_sds = []

            for input_sd_entry in input_sds:
                action = input_sd_entry['action']
                source = Url(input_sd_entry['source'])
                target = Url(input_sd_entry['target'])

                new_sd = {'action'  : action,
                          'source'  : str(source),
                          'target'  : str(target),
                          'flags'   : input_sd_entry['flags'],
                          'priority': input_sd_entry['priority'],
                          'state'   : PENDING}

                if action in [LINK, COPY, MOVE]:
                    unit.Agent_Input_Directives.append(new_sd)
                    unit.Agent_Input_Status = PENDING
                elif action in [TRANSFER]:
                    if source.scheme and source.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote pull from the agent
                        unit.Agent_Input_Directives.append(new_sd)
                        unit.Agent_Input_Status = PENDING
                    else:
                        # Transfer from local to sandbox
                        unit.FTW_Input_Directives.append(new_sd)
                        unit.FTW_Input_Status = PENDING
                else:
                    logger.warn('Not sure if action %s makes sense for input staging' % action)

            # Split the output staging directives over the transfer worker
            # and the agent
            output_sds = unit.description.output_staging
            if not isinstance(output_sds, list):
                # Ugly, but a workaround for iterating on the attribute
                # interface.
                # TODO: Verify if this piece of code is actually still required
                if output_sds:
                    output_sds = [output_sds]
                else:
                    output_sds = []

            for output_sds_entry in output_sds:
                action = output_sds_entry['action']
                source = Url(output_sds_entry['source'])
                target = Url(output_sds_entry['target'])

                new_sd = {'action'  : action,
                          'source'  : str(source),
                          'target'  : str(target),
                          'flags'   : output_sds_entry['flags'],
                          'priority': output_sds_entry['priority'],
                          'state'   : PENDING}

                if action == LINK or action == COPY or action == MOVE:
                    unit.Agent_Output_Directives.append(new_sd)
                    unit.Agent_Output_Status = NEW
                elif action == TRANSFER:
                    if target.scheme and target.scheme != 'file':
                        # If there is a scheme and it is different than
                        # "file", assume a remote push from the agent
                        unit.Agent_Output_Directives.append(new_sd)
                        unit.Agent_Output_Status = NEW
                    else:
                        # Transfer from sandbox back to local
                        unit.FTW_Output_Directives.append(new_sd)
                        unit.FTW_Output_Status = NEW
                else:
                    logger.warn('Not sure if action %s makes sense for output staging' % action)

            if unit.FTW_Input_Directives or unit.Agent_Input_Directives:
                log = "Scheduled for data transfer to ComputePilot %s." % pilot_uid
                self._db.set_compute_unit_state(unit.uid,
                                                PENDING_INPUT_STAGING, log)
                cu_transfer.append(unit)
            else:
                cu_notransfer.append(unit)

        # Bulk-add all non-transfer units.
        self._db.assign_compute_units_to_pilot(units=cu_notransfer,
                                               pilot_uid=pilot_uid,
                                               pilot_sandbox=pilot_sandbox)

        self._db.assign_compute_units_to_pilot(units=cu_transfer,
                                               pilot_uid=pilot_uid,
                                               pilot_sandbox=pilot_sandbox)

        for unit in cu_notransfer:
            log = "Scheduled for execution on ComputePilot %s." % pilot_uid
            self._db.set_compute_unit_state(unit.uid, PENDING_EXECUTION, log)
          # self._set_state(uid, PENDING_EXECUTION, log)

        logger.info("Scheduled ComputeUnits %s for execution on ComputePilot '%s'."
                    % (cu_notransfer, pilot_uid))

    except Exception as e:
        logger.exception('error in unit manager controller (schedule())')
        raise
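# For reference, the staging directive dicts constructed above have this
# shape (values are illustrative):
#
#   {'action'  : TRANSFER,          # or LINK / COPY / MOVE
#    'source'  : 'input.dat',       # local path or URL
#    'target'  : 'input.dat',       # path relative to the unit sandbox
#    'flags'   : [],                # e.g. CREATE_PARENTS
#    'priority': 0,
#    'state'   : PENDING}
#
# TRANSFER directives without a remote source scheme go to the FTW
# (file transfer worker) lists; everything else goes to the agent lists.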
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.info("Starting InputFileTransferWorker")

        # Try to connect to the database and create a tailable cursor.
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s."
                         % self.unit_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            raise

        try:
            while not self._stop.is_set():
                # See if we can find a ComputeUnit that is waiting for
                # input file transfer.
                compute_unit = None

                ts = datetime.datetime.utcnow()
                compute_unit = um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "FTW_Input_Status": PENDING},
                    update={"$set" : {"FTW_Input_Status": EXECUTING,
                                      "state": STAGING_INPUT},
                            "$push": {"statehistory": {"state": STAGING_INPUT,
                                                       "timestamp": ts}}},
                    limit=BULK_LIMIT  # TODO: bulklimit is probably not the
                                      #       best way to ensure there is
                                      #       just one
                )
                # FIXME: AM: find_and_modify is not bulkable!
                state = STAGING_INPUT

                if compute_unit is None:
                    # Sleep a bit if no new units are available.
                    time.sleep(IDLE_TIME)
                else:
                    compute_unit_id = None
                    try:
                        log_messages = []

                        # We have found a new CU.  Now we can process the
                        # transfer directive(s) with SAGA.
                        compute_unit_id = str(compute_unit["_id"])
                        remote_sandbox  = compute_unit["sandbox"]
                        input_staging   = compute_unit["FTW_Input_Directives"]

                        # We need to create the CU's directory in case it
                        # doesn't exist yet.
                        log_msg = "Creating ComputeUnit sandbox directory %s." % remote_sandbox
                        log_messages.append(log_msg)
                        logger.info(log_msg)

                        # Creating the sandbox directory.
                        try:
                            logger.debug("saga.fs.Directory ('%s')" % remote_sandbox)

                            remote_sandbox_keyurl = saga.Url(remote_sandbox)
                            remote_sandbox_keyurl.path = '/'
                            remote_sandbox_key = str(remote_sandbox_keyurl)

                            if remote_sandbox_key not in self._saga_dirs:
                                self._saga_dirs[remote_sandbox_key] = \
                                    saga.filesystem.Directory(remote_sandbox_key,
                                                              flags=saga.filesystem.CREATE_PARENTS,
                                                              session=self._session)

                            saga_dir = self._saga_dirs[remote_sandbox_key]
                            saga_dir.make_dir(remote_sandbox,
                                              flags=saga.filesystem.CREATE_PARENTS)
                        except Exception as e:
                            logger.exception('Error: %s' % e)
                            # FIXME: why is this exception ignored?  AM

                        logger.info("Processing input file transfers for ComputeUnit %s"
                                    % compute_unit_id)
                        # Loop over all transfer directives and execute them.
                        for sd in input_staging:

                            state_doc = um_col.find_one(
                                {"_id": compute_unit_id},
                                fields=["state"]
                            )
                            if state_doc['state'] == CANCELED:
                                logger.info("Compute Unit Canceled, interrupting input file transfers.")
                                state = CANCELED
                                break

                            abs_src = os.path.abspath(sd['source'])
                            input_file_url = saga.Url("file://localhost/%s" % abs_src)
                            if not sd['target']:
                                target = remote_sandbox
                            else:
                                target = "%s/%s" % (remote_sandbox, sd['target'])

                            log_msg = "Transferring input file %s -> %s" % (input_file_url, target)
                            log_messages.append(log_msg)
                            logger.debug(log_msg)

                            # Execute the transfer.
                            logger.debug("saga.fs.File ('%s')" % input_file_url)
                            input_file = saga.filesystem.File(
                                input_file_url,
                                session=self._session
                            )

                            if CREATE_PARENTS in sd['flags']:
                                copy_flags = saga.filesystem.CREATE_PARENTS
                            else:
                                copy_flags = 0

                            try:
                                input_file.copy(target, flags=copy_flags)
                            except Exception as e:
                                logger.exception(e)
                            input_file.close()

                            # If all went fine, update the state of this
                            # StagingDirective to Done
                            um_col.find_and_modify(
                                query={"_id": compute_unit_id,
                                       'FTW_Input_Status': EXECUTING,
                                       'FTW_Input_Directives.state': PENDING,
                                       'FTW_Input_Directives.source': sd['source'],
                                       'FTW_Input_Directives.target': sd['target']},
                                update={'$set' : {'FTW_Input_Directives.$.state': 'Done'},
                                        '$push': {'log': {'timestamp': datetime.datetime.utcnow(),
                                                          'message'  : log_msg}}}
                            )

                    except Exception as e:
                        # Update the CU's state to 'FAILED'.
                        ts = datetime.datetime.utcnow()
                        logentry = {'message'  : "Input transfer failed: %s" % e,
                                    'timestamp': ts}
                        um_col.update({'_id': compute_unit_id},
                                      {'$set' : {'state': FAILED},
                                       '$push': {'statehistory': {'state': FAILED,
                                                                  'timestamp': ts},
                                                 'log': logentry}})
                        logger.exception(str(logentry))

                # Code below is only to be run by the "first" or only worker
                if self._worker_number > 1:
                    continue

                # If the CU was canceled we can skip the remainder of this
                # loop.
                if state == CANCELED:
                    continue

                #
                # Check to see if there are more pending Directives.
                # If not, we are Done.
                #
                cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                        "$or": [{"Agent_Input_Status": EXECUTING},
                                                {"FTW_Input_Status"  : EXECUTING}]})

                # Iterate over all the returned CUs (if any)
                for cu in cursor_w:

                    # See if there are any FTW Input Directives still pending
                    if cu['FTW_Input_Status'] == EXECUTING and \
                       not any(d['state'] == EXECUTING or d['state'] == PENDING
                               for d in cu['FTW_Input_Directives']):
                        # All Input Directives for this FTW are done,
                        # mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set' : {'FTW_Input_Status': DONE},
                                       '$push': {'log': {'timestamp': datetime.datetime.utcnow(),
                                                         'message'  : 'All FTW Input Staging Directives done - %d.'
                                                                      % self._worker_number}}})

                    # See if there are any Agent Input Directives still
                    # pending or executing; if not, mark it DONE.
                    if cu['Agent_Input_Status'] == EXECUTING and \
                       not any(d['state'] == EXECUTING or d['state'] == PENDING
                               for d in cu['Agent_Input_Directives']):
                        # All Input Directives for this Agent are done,
                        # mark the CU accordingly
                        um_col.update({"_id": cu["_id"]},
                                      {'$set' : {'Agent_Input_Status': DONE},
                                       '$push': {'log': {'timestamp': datetime.datetime.utcnow(),
                                                         'message'  : 'All Agent Input Staging Directives done - %d.'
                                                                      % self._worker_number}}})

                #
                # Check for all CUs if both Agent and FTW staging is done;
                # we can then mark the CU PendingExecution.
                #
                ts = datetime.datetime.utcnow()
                um_col.find_and_modify(
                    query={"unitmanager": self.unit_manager_id,
                           "Agent_Input_Status": {"$in": [None, DONE]},
                           "FTW_Input_Status"  : {"$in": [None, DONE]},
                           "state": STAGING_INPUT},
                    update={"$set" : {"state": PENDING_EXECUTION},
                            "$push": {"statehistory": {"state": PENDING_EXECUTION,
                                                       "timestamp": ts}}}
                )

        except Exception as e:
            logger.exception("transfer worker error: %s" % e)
            self._session.close(cleanup=False)
            raise

    except SystemExit as e:
        logger.debug("input file transfer thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
def run(self):
    """run() is called when the process is started via
       PilotManagerController.start().
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.debug("Worker thread (ID: %s[%s]) for PilotManager %s started."
                     % (self.name, self.ident, self._pm_id))

        while not self._stop.is_set():

          # # Check if one or more startup requests have finished.
          # self.startup_results_lock.acquire()
          # new_startup_results = list()
          # for transfer_result in self.startup_results:
          #     if transfer_result.ready():
          #         result = transfer_result.get()
          #         self._db.update_pilot_state(
          #             pilot_uid=result["pilot_uid"],
          #             state=result["state"],
          #             sagajobid=result["saga_job_id"],
          #             pilot_sandbox=result["sandbox"],
          #             global_sandbox=result["global_sandbox"],
          #             submitted=result["submitted"],
          #             logs=result["logs"])
          #     else:
          #         new_startup_results.append(transfer_result)
          # self.startup_results = new_startup_results
          # self.startup_results_lock.release()

            # Check and update pilots.  This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            pilot_list = self._db.get_pilots(pilot_manager_id=self._pm_id)
            action = False

            for pilot in pilot_list:
                pilot_id = str(pilot["_id"])

                new_state = pilot["state"]
                if pilot_id in self._shared_data:
                    old_state = self._shared_data[pilot_id]["data"]["state"]
                else:
                    old_state = None
                    self._shared_data[pilot_id] = {
                        'data'         : pilot,
                        'callbacks'    : [],
                        'facade_object': None
                    }

                self._shared_data[pilot_id]['data'] = pilot

                # FIXME: *groan* what a hack...  The Canceling state is by
                # the nature of it not recorded in the database, but only in
                # the local cache.  So if we see it as old state, we have to
                # avoid state transitions into non-final states in the cache
                # at all cost -- so we catch this here specifically.
                no_cb = False
                if old_state == CANCELING:
                    if new_state not in [DONE, FAILED, CANCELED]:
                        # restore old state, making the cache explicitly
                        # different than the DB recorded state
                        self._shared_data[pilot_id]["data"]["state"] = old_state
                        # do not trigger a state cb!
                        no_cb = True

                if new_state != old_state:
                    action = True

                    if not no_cb:
                        # On a state change, we fire zee callbacks.
                        logger.info("ComputePilot '%s' state changed from '%s' to '%s'."
                                    % (pilot_id, old_state, new_state))

                        # The state of the pilot has changed.  We call all
                        # pilot-level callbacks to propagate this.  This also
                        # includes communication to the unit scheduler which
                        # may, or may not, cancel the pilot's units.
                        self.call_callbacks(pilot_id, new_state)

                    # If the state is 'DONE', 'FAILED' or 'CANCELED', we also
                    # set the state of the compute unit accordingly (but only
                    # for non-final units)
                    if new_state in [FAILED, DONE, CANCELED]:
                        unit_ids = self._db.pilot_list_compute_units(pilot_uid=pilot_id)
                        self._db.set_compute_unit_state(
                            unit_ids=unit_ids,
                            state=CANCELED,
                            src_states=[PENDING_INPUT_STAGING,
                                        STAGING_INPUT,
                                        PENDING_EXECUTION,
                                        SCHEDULING,
                                        EXECUTING,
                                        PENDING_OUTPUT_STAGING,
                                        STAGING_OUTPUT],
                            log="Pilot '%s' has terminated with state '%s'. CU canceled."
                                % (pilot_id, new_state))

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)

    except SystemExit as e:
        logger.exception("pilot manager controller thread caught system exit "
                         "-- forcing application shutdown")
        import thread
        thread.interrupt_main()

    finally:
        # shut down the autonomous pilot launcher worker(s)
        for worker in self._pilot_launcher_worker_pool:
            logger.debug("pworker %s stops launcher %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("pworker %s stopped launcher %s" % (self.name, worker.name))
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s."
                         % self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reason and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job
            # service, a job description and a job that is then sent to the
            # local or remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            compute_pilot = None

            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set" : {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}})

            if not compute_pilot:
                time.sleep(IDLE_TIMER)
            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid   = self.db_connection_info.session_id
                    database_url  = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores   = compute_pilot['description']['cores']
                    runtime        = compute_pilot['description']['runtime']
                    queue          = compute_pilot['description']['queue']
                    project        = compute_pilot['description']['project']
                    cleanup        = compute_pilot['description']['cleanup']
                    resource_key   = compute_pilot['description']['resource']
                    schema         = compute_pilot['description']['access_schema']
                    memory         = compute_pilot['description']['memory']
                    pilot_sandbox  = compute_pilot['sandbox']
                    global_sandbox = compute_pilot['global_sandbox']

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema, so better use a deep
                    # copy...
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                  # import pprint
                  # pprint.pprint(resource_cfg)

                    # ------------------------------------------------------
                    # get parameters from cfg, set defaults where needed
                    agent_mongodb_endpoint  = resource_cfg.get('agent_mongodb_endpoint', database_url)
                    agent_spawner           = resource_cfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
                    agent_type              = resource_cfg.get('agent_type', DEFAULT_AGENT_TYPE)
                    agent_scheduler         = resource_cfg.get('agent_scheduler')
                    tunnel_bind_device      = resource_cfg.get('tunnel_bind_device')
                    default_queue           = resource_cfg.get('default_queue')
                    forward_tunnel_endpoint = resource_cfg.get('forward_tunnel_endpoint')
                    js_endpoint             = resource_cfg.get('job_manager_endpoint')
                    lrms                    = resource_cfg.get('lrms')
                    mpi_launch_method       = resource_cfg.get('mpi_launch_method')
                    pre_bootstrap           = resource_cfg.get('pre_bootstrap')
                    python_interpreter      = resource_cfg.get('python_interpreter')
                    spmd_variation          = resource_cfg.get('spmd_variation')
                    task_launch_method      = resource_cfg.get('task_launch_method')
                    rp_version              = resource_cfg.get('rp_version', DEFAULT_RP_VERSION)
                    virtenv_mode            = resource_cfg.get('virtenv_mode', DEFAULT_VIRTENV_MODE)
                    virtenv                 = resource_cfg.get('virtenv', DEFAULT_VIRTENV)
                    stage_cacerts           = resource_cfg.get('stage_cacerts', 'False')

                    if stage_cacerts.lower() == 'true':
                        stage_cacerts = True
                    else:
                        stage_cacerts = False

                    # expand variables in virtenv string
                    virtenv = virtenv % {'pilot_sandbox' : saga.Url(pilot_sandbox).path,
                                         'global_sandbox': saga.Url(global_sandbox).path}

                    # Check for deprecated global_virtenv
                    global_virtenv = resource_cfg.get('global_virtenv')
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = 'use'

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = 'mongodb'
                    if not db_url.host:
                        db_url.host = 'localhost'
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = 'radicalpilot'

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We always use "default_bootstrapper.sh".
                    bootstrapper = 'default_bootstrapper.sh'
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s"
                                                        % (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from rp_version,
                    # which has the following format and interpretation:
                    #
                    # case rp_version:
                    #   @<token>:
                    #   @tag/@branch/@commit:  # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release:               # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local:                 # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug:                 # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed:             # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples:
                    #   [email protected]
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid,
                    # specifically in the context of virtenv_mode.  If, for
                    # example, virtenv_mode is 'use', then the 'virtenv:tag'
                    # will not make sense, as the virtenv is not updated.
                    # In those cases, the virtenv_mode is honored, and
                    # a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # An rp_version which does not adhere to the above
                    # syntax is ignored, and the fallback stage@local
                    # is used.
                    if not rp_version.startswith('@') and \
                       rp_version not in ['installed', 'local', 'debug']:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ['installed', 'release']:
                        stage_sdist = False
                    if rp_version.startswith('@'):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:
                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." % (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated cert bundle.
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s"
                                                  % (mod_dir, 'cacert.pem.gz'))

                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    debug_level = os.environ.get('RADICAL_PILOT_AGENT_VERBOSE', logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        debug_level = {'CRITICAL': 1,
                                       'ERROR'   : 2,
                                       'WARNING' : 3,
                                       'WARN'    : 3,
                                       'INFO'    : 4,
                                       'DEBUG'   : 5}.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        cleanup = 'luve'  # l : log files
                                          # u : unit work dirs
                                          # v : virtualenv
                                          # e : everything (== pilot sandbox)
                        # we never clean up virtenvs which are not private
                        if virtenv_mode != 'private':
                            cleanup = cleanup.replace('v', '')

                    sdists = ':'.join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args  = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it
                    # is, we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data['job_services']:
                        js = self._shared_worker_data['job_services'][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data['job_services'][js_url] = js

                    # ------------------------------------------------------
                    # Create SAGA Job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable            = "/bin/bash"
                    jd.arguments             = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory     = saga.Url(pilot_sandbox).path
                    jd.project               = project
                    jd.output                = "agent.out"
                    jd.error                 = "agent.err"
                    jd.total_cpu_count       = number_cores
                    jd.wall_time_limit       = runtime
                    jd.total_physical_memory = memory
                    jd.queue                 = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if 'RADICAL_PILOT_PROFILE' in os.environ:
                        jd.environment = {'RADICAL_PILOT_PROFILE': 'TRUE'}

                    logger.debug("Bootstrap command line: %s %s" % (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data['job_ids'][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the Pilot's state to 'PENDING_ACTIVE' if SAGA
                    # job submission was successful.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id"  : pilot_id,
                         "state": 'Launching'},
                        {"$set"    : {"state"      : PENDING_ACTIVE,
                                      "saga_job_id": saga_job_id},
                         "$push"   : {"statehistory": {"state"    : PENDING_ACTIVE,
                                                       "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})

                    if ret['n'] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # job id then.
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {"$set"    : {"saga_job_id": saga_job_id},
                             "$push"   : {"statehistory": {"state"    : PENDING_ACTIVE,
                                                           "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}})

                except Exception as e:
                    # Update the Pilot's state to 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing an '#'.  This shows up here.
                    # Until we find a clean workaround, make the log shorter
                    # and rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts    = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id"  : pilot_id,
                         "state": {"$ne": FAILED}},
                        {"$set"    : {"state"  : FAILED,
                                      "stdout" : out,
                                      "stderr" : err,
                                      "logfile": log},
                         "$push"   : {"statehistory": {"state"    : FAILED,
                                                       "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}})
                    logger.exception('\n'.join(log_messages))

    except SystemExit as e:
        logger.exception("pilot launcher thread caught system exit -- forcing application shutdown")
        import thread
        thread.interrupt_main()
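# The launcher above reads its parameters from a resource configuration
# entry.  A minimal sketch of such a JSON entry (the keys match the
# resource_cfg.get() calls above; the values are made-up examples):
#
#   {
#       "job_manager_endpoint": "slurm+ssh://cluster.example.org/",
#       "default_queue"       : "normal",
#       "lrms"                : "SLURM",
#       "agent_scheduler"     : "CONTINUOUS",
#       "agent_spawner"       : "POPEN",
#       "task_launch_method"  : "SSH",
#       "mpi_launch_method"   : "MPIRUN",
#       "pre_bootstrap"       : ["module load python"],
#       "virtenv"             : "%(global_sandbox)s/ve",
#       "virtenv_mode"        : "create",
#       "rp_version"          : "local"
#   }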
def run(self):
    """run() is called when the process is started via
       UnitManagerController.start().
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        logger.debug("Worker thread (ID: %s[%s]) for UnitManager %s started."
                     % (self.name, self.ident, self._um_id))

        # transfer_results contains the futures to the results of the
        # asynchronous transfer operations.
        transfer_results = list()

        while not self._stop.is_set():

            # =================================================================
            #
            # Check and update units.  This needs to be optimized at
            # some point, i.e., state pulling should be conditional
            # or triggered by a tailable MongoDB cursor, etc.
            unit_list = self._db.get_compute_units(unit_manager_id=self._um_id)
            action = False

            for unit in unit_list:
                unit_id = str(unit["_id"])

                new_state = unit["state"]
                if unit_id in self._shared_data:
                    old_state = self._shared_data[unit_id]["data"]["state"]
                else:
                    old_state = None
                    self._shared_data_lock.acquire()
                    self._shared_data[unit_id] = {
                        'data'         : unit,
                        'callbacks'    : [],
                        'facade_object': None
                    }
                    self._shared_data_lock.release()

                self._shared_data_lock.acquire()
                self._shared_data[unit_id]["data"] = unit
                self._shared_data_lock.release()

                if new_state != old_state:
                    # On a state change, we fire zee callbacks.
                    logger.info("RUN ComputeUnit '%s' state changed from '%s' to '%s'."
                                % (unit_id, old_state, new_state))

                    # The state of the unit has changed.  We call all
                    # unit-level callbacks to propagate this.
                    self.call_unit_state_callbacks(unit_id, new_state)

                    action = True

            # After the first iteration, we are officially initialized!
            if not self._initialized.is_set():
                self._initialized.set()

            # sleep a little if this cycle was idle
            if not action:
                time.sleep(IDLE_TIME)

    except SystemExit as e:
        logger.exception("unit manager controller thread caught system exit "
                         "-- forcing application shutdown")
        import thread
        thread.interrupt_main()

    finally:
        # shut down the autonomous input / output transfer worker(s)
        for worker in self._input_file_transfer_worker_pool:
            logger.debug("uworker %s stops itransfer %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped itransfer %s" % (self.name, worker.name))

        for worker in self._output_file_transfer_worker_pool:
            logger.debug("uworker %s stops otransfer %s" % (self.name, worker.name))
            worker.stop()
            logger.debug("uworker %s stopped otransfer %s" % (self.name, worker.name))
def register_cancel_pilots_request(self, pilot_ids=None):
    """Registers one or more pilots for cancellation.
    """
    if pilot_ids is None:
        pilot_ids = list()
        for pilot in self._db.get_pilots(pilot_manager_id=self._pm_id):
            pilot_ids.append(str(pilot["_id"]))

    self._db.send_command_to_pilot(COMMAND_CANCEL_PILOT, pilot_ids=pilot_ids)
    logger.info("Sent 'COMMAND_CANCEL_PILOT' command to pilots %s.", pilot_ids)

    # pilots which are in ACTIVE state should now have time to react on the
    # CANCEL command sent above.  Meanwhile, we'll cancel all pending
    # pilots.  If that is done, we wait a little, say 10 seconds, to give
    # the pilot time to pick up the request and shut down -- but if it does
    # not do that, it will get killed the hard way...
    delayed_cancel = list()

    for pilot_id in pilot_ids:
        if pilot_id in self._shared_data:

            # read state from _shared_data only once, so that it does not
            # change under us...
            old_state = str(self._shared_data[pilot_id]["data"]["state"])
            logger.warn("actively cancel pilot %s state: %s" % (pilot_id, old_state))

            if old_state in [DONE, FAILED, CANCELED]:
                logger.warn("can't actively cancel pilot %s: already in final state" % pilot_id)

            elif old_state in [PENDING_LAUNCH, LAUNCHING, PENDING_ACTIVE]:
                if pilot_id in self._shared_worker_data['job_ids']:

                    try:
                        job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                        self._shared_data[pilot_id]["data"]["state"] = CANCELING
                        logger.info("actively cancel pilot %s (%s, %s)"
                                    % (pilot_id, job_id, js_url))

                        js  = self._shared_worker_data['job_services'][js_url]
                        job = js.get_job(job_id)
                        job.cancel()
                    except Exception as e:
                        logger.exception('pilot cancellation failed')

                else:
                    logger.warn("can't actively cancel pilot %s: no job id known" % pilot_id)
                    logger.debug(pprint.pformat(self._shared_worker_data))

            else:
                logger.debug("delay to actively cancel pilot %s: state %s"
                             % (pilot_id, old_state))
                delayed_cancel.append(pilot_id)

        else:
            logger.warn("can't actively cancel pilot %s: unknown pilot" % pilot_id)
            logger.debug(pprint.pformat(self._shared_data))

    # now tend to all delayed cancellation requests (i.e. active pilots) --
    # if there are any
    if delayed_cancel:

        # grant some leeway to the unruly children...
        time.sleep(10)

        for pilot_id in delayed_cancel:

            if pilot_id in self._shared_worker_data['job_ids']:

                try:
                    job_id, js_url = self._shared_worker_data['job_ids'][pilot_id]
                    logger.info("actively cancel pilot %s (delayed) (%s, %s)"
                                % (pilot_id, job_id, js_url))

                    js  = self._shared_worker_data['job_services'][js_url]
                    job = js.get_job(job_id)
                    job.cancel()
                except Exception as e:
                    logger.warn('delayed pilot cancellation failed. '
                                'This is not necessarily a problem.')

            else:
                logger.warn("can't actively cancel pilot %s: no job id known (delayed)" % pilot_id)
                logger.debug(pprint.pformat(self._shared_worker_data))
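# ---------------------------------------------------------------------------
# NOTE: the FIXME in the output transfer worker below points out that
# find_and_modify() only ever claims a single document, so the
# 'limit=BULK_LIMIT' argument has no effect.  A hedged sketch of a true bulk
# claim follows (illustrative only; unlike find_and_modify, the find/update
# pair below is not atomic, so two workers could race for the same units):
#
def _example_claim_pending_units(um_col, unit_manager_id, bulk_limit):
    """Claim up to bulk_limit units that are pending output transfer."""
    docs = list(um_col.find({"unitmanager": unit_manager_id,
                             "FTW_Output_Status": PENDING}).limit(bulk_limit))
    if docs:
        um_col.update({"_id": {"$in": [d["_id"] for d in docs]}},
                      {"$set": {"FTW_Output_Status": EXECUTING}},
                      multi=True)
    return docs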
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Try to connect to the database and create a tailable cursor.
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            um_col = db["%s.cu" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for UnitManager %s." %
                         self.unit_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        while not self._stop.is_set():
            compute_unit = None

            # See if we can find a ComputeUnit that is waiting for
            # output file transfer.
            ts = datetime.datetime.utcnow()
            compute_unit = um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       "FTW_Output_Status": PENDING},
                update={"$set": {"FTW_Output_Status": EXECUTING,
                                 "state": STAGING_OUTPUT},
                        "$push": {"statehistory": {"state": STAGING_OUTPUT,
                                                   "timestamp": ts}}},
                limit=BULK_LIMIT
            )
            # FIXME: AM: find_and_modify is not bulkable!
            state = STAGING_OUTPUT

            if compute_unit is None:
                # Sleep a bit if no new units are available.
                time.sleep(IDLE_TIME)
            else:
                logger.info("OFTW cu found, progressing ...")
                compute_unit_id = None
                try:
                    # We have found a new CU.  Now we can process the
                    # transfer directive(s) with SAGA.
                    compute_unit_id    = str(compute_unit["_id"])
                    remote_sandbox     = compute_unit["sandbox"]
                    staging_directives = compute_unit["FTW_Output_Directives"]

                    logger.info("Processing output file transfers for ComputeUnit %s" %
                                compute_unit_id)

                    # Loop over all staging directives and execute them.
                    for sd in staging_directives:

                        # Check if there was a cancel request
                        state_doc = um_col.find_one({"_id": compute_unit_id},
                                                    fields=["state"])
                        if state_doc['state'] == CANCELED:
                            logger.info("ComputeUnit canceled, interrupting output file transfers.")
                            state = CANCELED
                            break

                        action = sd['action']
                        source = sd['source']
                        target = sd['target']
                        flags  = sd['flags']

                        # Mark the beginning of transfer for this StagingDirective
                        um_col.find_and_modify(
                            query={"_id": compute_unit_id,
                                   'FTW_Output_Status': EXECUTING,
                                   'FTW_Output_Directives.state': PENDING,
                                   'FTW_Output_Directives.source': sd['source'],
                                   'FTW_Output_Directives.target': sd['target']},
                            update={'$set': {'FTW_Output_Directives.$.state': EXECUTING},
                                    '$push': {'log': {
                                        'timestamp': datetime.datetime.utcnow(),
                                        'message': 'Starting transfer of %s' % source}}}
                        )

                        abs_source = "%s/%s" % (remote_sandbox, source)

                        if os.path.basename(target) == target:
                            abs_target = "file://localhost%s" % os.path.join(os.getcwd(), target)
                        else:
                            abs_target = "file://localhost%s" % os.path.abspath(target)

                        log_msg = "Transferring output file %s -> %s" % (abs_source, abs_target)
                        logger.debug(log_msg)

                        logger.debug("saga.fs.File ('%s')" % saga.Url(abs_source))
                        output_file = saga.filesystem.File(saga.Url(abs_source),
                                                           session=self._session)

                        if CREATE_PARENTS in flags:
                            copy_flags = saga.filesystem.CREATE_PARENTS
                        else:
                            copy_flags = 0

                        logger.debug("saga.fs.File.copy ('%s')" % saga.Url(abs_target))
                        output_file.copy(saga.Url(abs_target), flags=copy_flags)
                        output_file.close()

                        # If all went fine, update the state of this
                        # StagingDirective to DONE.
                        um_col.find_and_modify(
                            query={"_id": compute_unit_id,
                                   'FTW_Output_Status': EXECUTING,
                                   'FTW_Output_Directives.state': EXECUTING,
                                   'FTW_Output_Directives.source': sd['source'],
                                   'FTW_Output_Directives.target': sd['target']},
                            update={'$set': {'FTW_Output_Directives.$.state': DONE},
                                    '$push': {'log': {
                                        'timestamp': datetime.datetime.utcnow(),
                                        'message': log_msg}}}
                        )

                except Exception as e:
                    # Update the CU's state to 'FAILED'.
                    ts = datetime.datetime.utcnow()
                    log_message = "Output transfer failed: %s" % e
                    # TODO: not only mark the CU as failed, but also the specific Directive
                    um_col.update({'_id': compute_unit_id},
                                  {'$set': {'state': FAILED},
                                   '$push': {
                                       'statehistory': {'state': FAILED, 'timestamp': ts},
                                       'log': {'message': log_message, 'timestamp': ts}}})
                    logger.exception(log_message)

            # Code below is only to be run by the "first" or only worker
            if self._worker_number > 1:
                continue

            # If the CU was canceled we can skip the remainder of this loop.
            if state == CANCELED:
                continue

            #
            # Check to see if there are more active Directives; if not, we are done.
            #
            cursor_w = um_col.find({"unitmanager": self.unit_manager_id,
                                    "$or": [{"Agent_Output_Status": EXECUTING},
                                            {"FTW_Output_Status": EXECUTING}]})

            # Iterate over all the returned CUs (if any)
            for cu in cursor_w:

                # See if there are any FTW Output Directives still pending
                if cu['FTW_Output_Status'] == EXECUTING and \
                        not any(d['state'] in (EXECUTING, PENDING)
                                for d in cu['FTW_Output_Directives']):
                    # All Output Directives for this FTW are done; mark the CU accordingly
                    um_col.update({"_id": cu["_id"]},
                                  {'$set': {'FTW_Output_Status': DONE},
                                   '$push': {'log': {
                                       'timestamp': datetime.datetime.utcnow(),
                                       'message': 'All FTW output staging directives done - %d.' %
                                                  self._worker_number}}})

                # See if there are any Agent Output Directives still pending
                if cu['Agent_Output_Status'] == EXECUTING and \
                        not any(d['state'] in (EXECUTING, PENDING)
                                for d in cu['Agent_Output_Directives']):
                    # All Output Directives for this Agent are done; mark the CU accordingly
                    um_col.update({"_id": cu["_id"]},
                                  {'$set': {'Agent_Output_Status': DONE},
                                   '$push': {'log': {
                                       'timestamp': datetime.datetime.utcnow(),
                                       'message': 'All Agent output staging directives done - %d.' %
                                                  self._worker_number}}})

            #
            # Check for all CUs if both Agent and FTW staging is done; if so,
            # we can mark the CU as DONE.
            #
            ts = datetime.datetime.utcnow()
            um_col.find_and_modify(
                query={"unitmanager": self.unit_manager_id,
                       # TODO: Now that our state model is linear,
                       # we probably don't need to check Agent_Output_Status anymore.
                       # Given that it is not updated by the agent currently, disable it here.
                       # "Agent_Output_Status": {"$in": [None, DONE]},
                       "FTW_Output_Status": {"$in": [None, DONE]},
                       "state": STAGING_OUTPUT},
                update={"$set": {"state": DONE},
                        "$push": {"statehistory": {"state": DONE, "timestamp": ts}}}
            )

    except SystemExit as e:
        logger.exception("output file transfer thread caught system exit -- "
                         "forcing application shutdown")
        import thread
        thread.interrupt_main()
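# ---------------------------------------------------------------------------
# NOTE: the worker above consumes 'FTW_Output_Directives' entries with the
# following fields (field names taken from the code above; the concrete
# values are made up for illustration):
#
#   {
#       'action': 'Transfer',         # hypothetical action value
#       'source': 'output.dat',       # path relative to the unit sandbox
#       'target': '/tmp/output.dat',  # local target path or bare file name
#       'flags':  [CREATE_PARENTS],   # optional transfer flags
#       'state':  PENDING,            # PENDING -> EXECUTING -> DONE
#   }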
def run(self):
    """Starts the process when Process.start() is called.
    """
    # make sure to catch sys.exit (which raises SystemExit)
    try:
        # Get the directory where this module lives
        mod_dir = os.path.dirname(os.path.realpath(__file__))

        # Try to connect to the database
        try:
            connection = self.db_connection_info.get_db_handle()
            db = connection[self.db_connection_info.dbname]
            pilot_col = db["%s.p" % self.db_connection_info.session_id]
            logger.debug("Connected to MongoDB. Serving requests for PilotManager %s." %
                         self.pilot_manager_id)
        except Exception as e:
            logger.exception("Connection error: %s" % e)
            return

        last_job_check = time.time()

        while not self._stop.is_set():

            # Periodically, we pull up all ComputePilots that are pending
            # execution or were last seen executing, and check if the
            # corresponding SAGA job is still pending in the queue.  If that
            # is not the case, we assume that the job has failed for some
            # reason and update the state of the ComputePilot accordingly.
            if last_job_check + JOB_CHECK_INTERVAL < time.time():
                last_job_check = time.time()
                self.check_pilot_states(pilot_col)

            # See if we can find a ComputePilot that is waiting to be
            # launched.  If we find one, we use SAGA to create a job service,
            # a job description and a job that is then sent to the local or
            # remote queueing system.  If this succeeds, we set the
            # ComputePilot's state to pending, otherwise to failed.
            compute_pilot = None

            ts = datetime.datetime.utcnow()
            compute_pilot = pilot_col.find_and_modify(
                query={"pilotmanager": self.pilot_manager_id,
                       "state": PENDING_LAUNCH},
                update={"$set": {"state": LAUNCHING},
                        "$push": {"statehistory": {"state": LAUNCHING,
                                                   "timestamp": ts}}}
            )

            if not compute_pilot:
                time.sleep(IDLE_TIMER)
            else:
                try:
                    # ------------------------------------------------------
                    #
                    # LAUNCH THE PILOT AGENT VIA SAGA
                    #
                    logentries = []
                    pilot_id = str(compute_pilot["_id"])

                    logger.info("Launching ComputePilot %s" % pilot_id)

                    # ------------------------------------------------------
                    # Database connection parameters
                    session_uid   = self.db_connection_info.session_id
                    database_url  = self.db_connection_info.dburl
                    database_name = self.db_connection_info.dbname
                    database_auth = self.db_connection_info.dbauth

                    # ------------------------------------------------------
                    # pilot description and resource configuration
                    number_cores = compute_pilot["description"]["cores"]
                    runtime      = compute_pilot["description"]["runtime"]
                    queue        = compute_pilot["description"]["queue"]
                    project      = compute_pilot["description"]["project"]
                    cleanup      = compute_pilot["description"]["cleanup"]
                    resource_key = compute_pilot["description"]["resource"]
                    schema       = compute_pilot["description"]["access_schema"]
                    memory       = compute_pilot["description"]["memory"]

                    pilot_sandbox  = compute_pilot["sandbox"]
                    global_sandbox = compute_pilot["global_sandbox"]

                    # we expand and exchange keys in the resource config,
                    # depending on the selected schema, so better use a deep
                    # copy...
                    resource_cfg = self._session.get_resource_config(resource_key, schema)

                    # ------------------------------------------------------
                    # get parameters from the config; set defaults where needed
                    agent_mongodb_endpoint  = resource_cfg.get("agent_mongodb_endpoint", database_url)
                    agent_spawner           = resource_cfg.get("agent_spawner", DEFAULT_AGENT_SPAWNER)
                    agent_type              = resource_cfg.get("agent_type", DEFAULT_AGENT_TYPE)
                    agent_scheduler         = resource_cfg.get("agent_scheduler")
                    tunnel_bind_device      = resource_cfg.get("tunnel_bind_device")
                    default_queue           = resource_cfg.get("default_queue")
                    forward_tunnel_endpoint = resource_cfg.get("forward_tunnel_endpoint")
                    js_endpoint             = resource_cfg.get("job_manager_endpoint")
                    lrms                    = resource_cfg.get("lrms")
                    mpi_launch_method       = resource_cfg.get("mpi_launch_method")
                    pre_bootstrap           = resource_cfg.get("pre_bootstrap")
                    python_interpreter      = resource_cfg.get("python_interpreter")
                    spmd_variation          = resource_cfg.get("spmd_variation")
                    task_launch_method      = resource_cfg.get("task_launch_method")
                    rp_version              = resource_cfg.get("rp_version", DEFAULT_RP_VERSION)
                    virtenv_mode            = resource_cfg.get("virtenv_mode", DEFAULT_VIRTENV_MODE)
                    virtenv                 = resource_cfg.get("virtenv", DEFAULT_VIRTENV)
                    stage_cacerts           = resource_cfg.get("stage_cacerts", "False")

                    # the config value is a string -- convert it to a bool
                    stage_cacerts = (stage_cacerts.lower() == "true")

                    # expand variables in the virtenv string
                    virtenv = virtenv % {
                        "pilot_sandbox":  saga.Url(pilot_sandbox).path,
                        "global_sandbox": saga.Url(global_sandbox).path}

                    # Check for the deprecated 'global_virtenv' setting
                    global_virtenv = resource_cfg.get("global_virtenv")
                    if global_virtenv:
                        logger.warn("'global_virtenv' keyword is deprecated -- "
                                    "use 'virtenv' and 'virtenv_mode'")
                        virtenv = global_virtenv
                        virtenv_mode = "use"

                    # set default scheme, host, port and dbname if not set
                    db_url = saga.Url(agent_mongodb_endpoint)
                    if not db_url.scheme:
                        db_url.scheme = "mongodb"
                    if not db_url.host:
                        db_url.host = "localhost"
                    if not db_url.port:
                        db_url.port = 27017
                    if not database_name:
                        database_name = "radicalpilot"

                    # Create a host:port string for use by the bootstrapper.
                    database_hostport = "%s:%d" % (db_url.host, db_url.port)

                    # ------------------------------------------------------
                    # Copy the bootstrap shell script.  This also creates
                    # the sandbox.  We always use "default_bootstrapper.sh".
                    bootstrapper = "default_bootstrapper.sh"
                    bootstrapper_path = os.path.abspath("%s/../bootstrapper/%s" %
                                                        (mod_dir, bootstrapper))

                    msg = "Using bootstrapper %s" % bootstrapper_path
                    logentries.append(Logentry(msg, logger=logger.info))

                    bs_script_url = saga.Url("file://localhost/%s" % bootstrapper_path)
                    bs_script_tgt = saga.Url("%s/pilot_bootstrapper.sh" % pilot_sandbox)

                    msg = "Copying bootstrapper '%s' to agent sandbox (%s)." \
                          % (bs_script_url, bs_script_tgt)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    bs_script = saga.filesystem.File(bs_script_url, session=self._session)
                    bs_script.copy(bs_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                    bs_script.close()

                    # ------------------------------------------------------
                    # the version of the agent is derived from rp_version,
                    # which has the following format and interpretation:
                    #
                    # case rp_version:
                    #   @<token> (@tag / @branch / @commit):  # no sdist staging
                    #       git clone $github_base radical.pilot.src
                    #       (cd radical.pilot.src && git checkout token)
                    #       pip install -t $VIRTENV/rp_install/ radical.pilot.src
                    #       rm -rf radical.pilot.src
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   release:  # no sdist staging
                    #       pip install -t $VIRTENV/rp_install radical.pilot
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   local:  # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $VIRTENV/rp_install $sdist/
                    #       export PYTHONPATH=$VIRTENV/rp_install:$PYTHONPATH
                    #
                    #   debug:  # needs sdist staging
                    #       tar zxf $sdist.tgz
                    #       pip install -t $SANDBOX/rp_install $sdist/
                    #       export PYTHONPATH=$SANDBOX/rp_install:$PYTHONPATH
                    #
                    #   installed:  # no sdist staging
                    #       true
                    # esac
                    #
                    # virtenv_mode:
                    #   private : error  if ve exists, otherwise create, then use
                    #   update  : update if ve exists, otherwise create, then use
                    #   create  : use    if ve exists, otherwise create, then use
                    #   use     : use    if ve exists, otherwise error,  then exit
                    #   recreate: delete if ve exists, otherwise create, then use
                    #
                    # examples:
                    #   virtenv@<tag>
                    #   virtenv@devel
                    #   virtenv@release
                    #   virtenv@installed
                    #   stage@local
                    #   stage@/tmp/my_agent.py
                    #
                    # Note that some combinations may be invalid, specifically
                    # in the context of virtenv_mode.  If, for example,
                    # virtenv_mode is 'use', then 'virtenv@tag' will not make
                    # sense, as the virtenv is not updated.  In those cases,
                    # the virtenv_mode is honored, and a warning is printed.
                    #
                    # Also, the 'stage' mode can only be combined with the
                    # 'local' source, or with a path to the agent (relative
                    # to mod_dir, or absolute).
                    #
                    # An rp_version which does not adhere to the above syntax
                    # is ignored, and the fallback stage@local is used.

                    # Note: "release" is documented as valid above and handled
                    # below, so it is included in this check.
                    if not rp_version.startswith("@") and \
                            rp_version not in ["installed", "local", "debug", "release"]:
                        raise ValueError("invalid rp_version '%s'" % rp_version)

                    stage_sdist = True
                    if rp_version in ["installed", "release"]:
                        stage_sdist = False

                    if rp_version.startswith("@"):
                        stage_sdist = False
                        rp_version = rp_version[1:]  # strip the '@'

                    # ------------------------------------------------------
                    # Copy the rp sdist if needed.  We actually also stage
                    # the sdists for radical.utils and radical.saga, so that
                    # we have the complete stack to install...
                    if stage_sdist:
                        for path in [ru.sdist_path, saga.sdist_path, sdist_path]:
                            sdist_url = saga.Url("file://localhost/%s" % path)
                            msg = "Copying sdist '%s' to sdist sandbox (%s)." % \
                                  (sdist_url, pilot_sandbox)
                            logentries.append(Logentry(msg, logger=logger.debug))

                            sdist_file = saga.filesystem.File(sdist_url)
                            sdist_file.copy("%s/" % (str(pilot_sandbox)))
                            sdist_file.close()

                    # ------------------------------------------------------
                    # Some machines cannot run pip due to outdated CA certs.
                    # For those, we also stage an updated cert bundle.
                    if stage_cacerts:
                        cc_path = os.path.abspath("%s/../bootstrapper/%s" %
                                                  (mod_dir, "cacert.pem.gz"))

                        cc_script_url = saga.Url("file://localhost/%s" % cc_path)
                        cc_script_tgt = saga.Url("%s/cacert.pem.gz" % pilot_sandbox)

                        cc_script = saga.filesystem.File(cc_script_url, session=self._session)
                        cc_script.copy(cc_script_tgt, flags=saga.filesystem.CREATE_PARENTS)
                        cc_script.close()

                    # ------------------------------------------------------
                    # sanity checks
                    if not agent_spawner:
                        raise RuntimeError("missing agent spawner")
                    if not agent_scheduler:
                        raise RuntimeError("missing agent scheduler")
                    if not lrms:
                        raise RuntimeError("missing LRMS")
                    if not mpi_launch_method:
                        raise RuntimeError("missing mpi launch method")
                    if not task_launch_method:
                        raise RuntimeError("missing task launch method")

                    # massage some values
                    debug_level = os.environ.get("RADICAL_PILOT_AGENT_VERBOSE", logger.level)
                    try:
                        debug_level = int(debug_level)
                    except ValueError:
                        debug_level = {
                            "CRITICAL": 1,
                            "ERROR":    2,
                            "WARNING":  3,
                            "WARN":     3,
                            "INFO":     4,
                            "DEBUG":    5}.get(debug_level, 0)

                    if not queue:
                        queue = default_queue

                    if cleanup and isinstance(cleanup, bool):
                        # l : log files
                        # u : unit work dirs
                        # v : virtualenv
                        # e : everything (== pilot sandbox)
                        cleanup = "luve"

                    # we never clean up virtenvs which are not private
                    if virtenv_mode != "private":
                        cleanup = cleanup.replace("v", "")

                    sdists = ":".join([ru.sdist_name, saga.sdist_name, sdist_name])

                    # set mandatory args
                    bootstrap_args  = ""
                    bootstrap_args += " -b '%s'" % sdists
                    bootstrap_args += " -c '%s'" % number_cores
                    bootstrap_args += " -d '%s'" % debug_level
                    bootstrap_args += " -g '%s'" % virtenv
                    bootstrap_args += " -j '%s'" % task_launch_method
                    bootstrap_args += " -k '%s'" % mpi_launch_method
                    bootstrap_args += " -l '%s'" % lrms
                    bootstrap_args += " -m '%s'" % database_hostport
                    bootstrap_args += " -n '%s'" % database_name
                    bootstrap_args += " -o '%s'" % agent_spawner
                    bootstrap_args += " -p '%s'" % pilot_id
                    bootstrap_args += " -q '%s'" % agent_scheduler
                    bootstrap_args += " -r '%s'" % runtime
                    bootstrap_args += " -s '%s'" % session_uid
                    bootstrap_args += " -t '%s'" % agent_type
                    bootstrap_args += " -u '%s'" % virtenv_mode
                    bootstrap_args += " -v '%s'" % rp_version

                    # set optional args
                    if database_auth:
                        bootstrap_args += " -a '%s'" % database_auth
                    if tunnel_bind_device:
                        bootstrap_args += " -D '%s'" % tunnel_bind_device
                    if pre_bootstrap:
                        bootstrap_args += " -e '%s'" % "' -e '".join(pre_bootstrap)
                    if forward_tunnel_endpoint:
                        bootstrap_args += " -f '%s'" % forward_tunnel_endpoint
                    if python_interpreter:
                        bootstrap_args += " -i '%s'" % python_interpreter
                    if cleanup:
                        bootstrap_args += " -x '%s'" % cleanup

                    # ------------------------------------------------------
                    # now that the script is in place and we know where it is,
                    # we can launch the agent
                    js_url = saga.Url(js_endpoint)
                    logger.debug("saga.job.Service ('%s')" % js_url)
                    if js_url in self._shared_worker_data["job_services"]:
                        js = self._shared_worker_data["job_services"][js_url]
                    else:
                        js = saga.job.Service(js_url, session=self._session)
                        self._shared_worker_data["job_services"][js_url] = js

                    # ------------------------------------------------------
                    # Create a SAGA job description and submit the pilot job
                    jd = saga.job.Description()

                    jd.executable            = "/bin/bash"
                    jd.arguments             = ["-l pilot_bootstrapper.sh", bootstrap_args]
                    jd.working_directory     = saga.Url(pilot_sandbox).path
                    jd.project               = project
                    jd.output                = "agent.out"
                    jd.error                 = "agent.err"
                    jd.total_cpu_count       = number_cores
                    jd.wall_time_limit       = runtime
                    jd.total_physical_memory = memory
                    jd.queue                 = queue

                    # Set the SPMD variation only if required
                    if spmd_variation:
                        jd.spmd_variation = spmd_variation

                    if "RADICAL_PILOT_PROFILE" in os.environ:
                        jd.environment = {"RADICAL_PILOT_PROFILE": "TRUE"}

                    logger.debug("Bootstrap command line: %s %s" %
                                 (jd.executable, jd.arguments))

                    msg = "Submitting SAGA job with description: %s" % str(jd.as_dict())
                    logentries.append(Logentry(msg, logger=logger.debug))

                    pilotjob = js.create_job(jd)
                    pilotjob.run()

                    # do a quick error check
                    if pilotjob.state == saga.FAILED:
                        raise RuntimeError("SAGA Job state is FAILED.")

                    saga_job_id = pilotjob.id
                    self._shared_worker_data["job_ids"][pilot_id] = [saga_job_id, js_url]

                    msg = "SAGA job submitted with job id %s" % str(saga_job_id)
                    logentries.append(Logentry(msg, logger=logger.debug))

                    #
                    # ------------------------------------------------------

                    log_dicts = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())

                    # Update the pilot's state to 'PENDING_ACTIVE' if the
                    # SAGA job submission was successful.
                    ts = datetime.datetime.utcnow()
                    ret = pilot_col.update(
                        {"_id": pilot_id, "state": LAUNCHING},
                        {"$set": {"state": PENDING_ACTIVE,
                                  "saga_job_id": saga_job_id},
                         "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}}
                    )

                    if ret["n"] == 0:
                        # could not update, probably because the agent is
                        # running already.  Just update state history and
                        # job id then.
                        # FIXME: make sure of the agent state!
                        ret = pilot_col.update(
                            {"_id": pilot_id},
                            {"$set": {"saga_job_id": saga_job_id},
                             "$push": {"statehistory": {"state": PENDING_ACTIVE,
                                                        "timestamp": ts}},
                             "$pushAll": {"log": log_dicts}}
                        )

                except Exception as e:
                    # Update the pilot's state to 'FAILED'.
                    out, err, log = self._get_pilot_logs(pilot_col, pilot_id)
                    ts = datetime.datetime.utcnow()

                    # FIXME: we seem to be unable to bson/json handle saga
                    # log messages containing a '#'.  This shows up here.
                    # Until we find a clean workaround, keep the log shorter
                    # and rely on saga logging to reveal the problem.
                    msg = "Pilot launching failed! (%s)" % e
                    logentries.append(Logentry(msg))

                    log_dicts    = list()
                    log_messages = list()
                    for le in logentries:
                        log_dicts.append(le.as_dict())
                        log_messages.append(le.message)

                    pilot_col.update(
                        {"_id": pilot_id, "state": {"$ne": FAILED}},
                        {"$set": {"state": FAILED,
                                  "stdout": out,
                                  "stderr": err,
                                  "logfile": log},
                         "$push": {"statehistory": {"state": FAILED,
                                                    "timestamp": ts}},
                         "$pushAll": {"log": log_dicts}}
                    )
                    logger.exception("\n".join(log_messages))

    except SystemExit as e:
        logger.exception("pilot launcher thread caught system exit -- "
                         "forcing application shutdown")
        import thread
        thread.interrupt_main()
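# ---------------------------------------------------------------------------
# NOTE: for reference, the job description assembled above launches the
# bootstrapper via a command line of roughly the following shape (all
# argument values here are illustrative, not actual defaults):
#
#   /bin/bash -l pilot_bootstrapper.sh \
#       -b 'radical.utils.tgz:saga-python.tgz:radical.pilot.tgz' \
#       -c '32' -d '4' -g '/path/to/virtenv' -l 'SLURM' \
#       -m 'localhost:27017' -n 'radicalpilot' -p '<pilot_id>' \
#       -r '60' -s '<session_uid>' -u 'update' -v 'release'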