def sm_sync(self):
    '''Resynchronize with the script manager'''
    # get this cache first -- it's no problem if this data is old, but bad things
    # happen when this data is newer than the list of running processes in scriptm
    self.lock.acquire()
    try:
        process_groups_cache = self.process_groups.values()
    except:
        self.logger.error("error copying process_groups.values()", exc_info=True)
        process_groups_cache = []
    finally:
        self.lock.release()

    try:
        pgroups = ComponentProxy("script-manager").get_jobs([{'id':'*', 'state':'running'}])
    except (ComponentLookupError, xmlrpclib.Fault):
        self.logger.error("Failed to communicate with script manager")
        return

    live = [item['id'] for item in pgroups]
    for each in process_groups_cache:
        if each.mode == 'script' and each.script_id not in live:
            self.logger.info("Found dead pg for script job %s" % (each.script_id))
            result = ComponentProxy("script-manager").wait_jobs([{'id':each.script_id, 'exit_status':'*'}])
            self.logger.info("wait returned %r" % result)
            for r in result:
                if r['id'] == each.script_id:
                    each.exit_status = r['exit_status']
                    self.reserve_resources_until(each.location, None, each.jobid)
def event_driver(self):
    """core part that drives the clock"""
    if self.go_next:
        # the clock is incremented only when the go_next tag is true; this
        # lets the scheduler schedule multiple jobs at the same time stamp
        self.clock_increment()
    machine = self.get_current_event_machine()
    # print "[%s]: %s, machine=%s, event=%s, job=%s" % (
    #     self.implementation,
    #     self.get_current_date_time(),
    #     self.get_current_event_machine(),
    #     self.get_current_event_type(),
    #     self.get_current_event_job(),
    #     )
    if machine == INTREPID:
        self.bgsched.schedule_jobs()
        util = ComponentProxy("queue-manager").get_util()
        self.log_info(util, "mira_util_mesh")
    if machine == EUREKA:
        self.csched.schedule_jobs()
    if self.go_next:
        ComponentProxy("queue-manager").calc_loss_of_capacity()
def signal_process_groups(self, specs, signame="SIGINT"):
    my_process_groups = self.process_groups.q_get(specs)
    for pg in my_process_groups:
        if pg.exit_status is None:
            if pg.mode == "script":
                try:
                    ComponentProxy("script-manager").signal_jobs([{'id': pg.script_id}], signame)
                except (ComponentLookupError, xmlrpclib.Fault):
                    self.logger.error("Failed to communicate with script manager when killing job")
            else:
                try:
                    ComponentProxy("forker").signal(pg.head_pid, signame)
                except:
                    self.logger.error("Failed to communicate with forker when signalling job")
            if signame == "SIGKILL" and not pg.true_mpi_args:
                self._mark_partition_for_cleaning(pg.location[0], pg.jobid)
    return my_process_groups
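
# A minimal caller sketch for signal_process_groups above.  The "system"
# component name and the {'id': '*'} wildcard spec follow the conventions
# visible elsewhere in this file; treating these as the accepted spec fields
# here is an assumption for illustration.
def terminate_all_process_groups():
    try:
        return ComponentProxy("system").signal_process_groups([{'id': '*'}], "SIGTERM")
    except (ComponentLookupError, xmlrpclib.Fault):
        print "unable to contact system component to signal process groups"
        return []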
def _get_exit_status(self):
    try:
        running = ComponentProxy("forker").active_list()
    except:
        self.logger.error("failed to contact forker component for list of running jobs")
        return

    for each in self.process_groups.itervalues():
        if each.head_pid not in running and each.exit_status is None:
            # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just
            # assume the process is dead?  or maybe just say there's no exit code the first time it happens?
            # maybe the second choice is better
            try:
                dead_dict = ComponentProxy("forker").get_status(each.head_pid)
            except:
                self.logger.error("failed call for get_status from forker component for pg %s", each.head_pid)
                return

            if dead_dict is None:
                self.logger.info("process group %i: job %s/%s exited with unknown status",
                    each.id, each.jobid, each.user)
                each.exit_status = 1234567
            else:
                each.exit_status = dead_dict["exit_status"]
                if dead_dict["signum"] == 0:
                    self.logger.info("process group %i: job %s/%s exited with status %i",
                        each.id, each.jobid, each.user, each.exit_status)
                else:
                    if dead_dict["core_dump"]:
                        core_dump_str = ", core dumped"
                    else:
                        core_dump_str = ""
                    self.logger.info("process group %i: job %s/%s terminated with signal %s%s",
                        each.id, each.jobid, each.user, dead_dict["signum"], core_dump_str)
def _get_exit_status(self):
    #common to bgsystem
    running = []
    active_forker_components = []
    for forker_component in ['bg_mpirun_forker', 'user_script_forker']:
        try:
            running.extend(ComponentProxy(forker_component).active_list("process group"))
            active_forker_components.append(forker_component)
        except:
            self.logger.error("failed to contact %s component for list of running jobs", forker_component)

    for each in self.process_groups.itervalues():
        if each.head_pid not in running and each.exit_status is None and each.forker in active_forker_components:
            # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just
            # assume the process is dead?  or maybe just say there's no exit code the first time it happens?
            # maybe the second choice is better
            try:
                if each.head_pid is not None:
                    dead_dict = ComponentProxy(each.forker).get_status(each.head_pid)
                else:
                    dead_dict = None
            except:
                self.logger.error("%s: RPC to get_status method in %s component failed", each.label, each.forker)
                return

            if dead_dict is None:
                self.logger.info("%s: job exited with unknown status", each.label)
                # FIXME: should we use a negative number instead to indicate internal errors? --brt
                each.exit_status = 1234567
            else:
                each.exit_status = dead_dict["exit_status"]
                if dead_dict["signum"] == 0:
                    self.logger.info("%s: job exited with status %i", each.label, each.exit_status)
                else:
                    if dead_dict["core_dump"]:
                        core_dump_str = ", core dumped"
                    else:
                        core_dump_str = ""
                    self.logger.info("%s: terminated with signal %s%s",
                        each.label, dead_dict["signum"], core_dump_str)
            self.reserve_resources_until(each.location, None, each.jobid)
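
# The FIXME above suggests retrying before giving up on a process group's
# exit status.  A minimal sketch of that idea, assuming a hypothetical
# per-group miss counter (_status_misses) that is not part of the original
# code; only after several consecutive failed polls would the caller fall
# back to the 1234567 sentinel used above.
MAX_STATUS_MISSES = 3  # hypothetical threshold

def _note_status_miss(self, pg):
    '''Count consecutive failed status polls; return True once we give up.'''
    pg._status_misses = getattr(pg, '_status_misses', 0) + 1
    if pg._status_misses < MAX_STATUS_MISSES:
        return False
    self.logger.info("%s: no exit status after %d polls; assuming dead",
        pg.label, pg._status_misses)
    pg.exit_status = 1234567  # same sentinel as above
    return True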
def check_dependencies(dependency_string):
    if dependency_string.lower() == 'none':
        #we are removing all job dependencies
        print "Removing job dependencies"
        return

    deps = set(dependency_string.split(":"))
    query = []
    for dep in deps:
        try:
            query.append({"jobid": int(dep)})
        except ValueError:
            pass

    jobs = ComponentProxy("queue-manager").get_jobs(query)
    job_ids = set([str(j["jobid"]) for j in jobs])
    missing = deps.difference(job_ids)
    if missing:
        print "WARNING: dependencies %s do not match jobs currently in the "\
            "queue" % ":".join(missing)
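
# A quick worked example of the parsing above, runnable without a live
# queue-manager: "1:2:junk" yields integer queries for 1 and 2, and the
# non-numeric token is silently dropped by the ValueError handler.
deps = set("1:2:junk".split(":"))
query = []
for dep in deps:
    try:
        query.append({"jobid": int(dep)})
    except ValueError:
        pass
assert sorted(q["jobid"] for q in query) == [1, 2]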
def check_reservations(self):
    ret = ""
    reservations = self.reservations.values()
    for i in range(len(reservations)):
        for j in range(i + 1, len(reservations)):
            # if at least one reservation is cyclic, we want *that* reservation
            # to be the one getting its overlaps method called
            if reservations[i].cycle is not None:
                res1 = reservations[i]
                res2 = reservations[j]
            else:
                res1 = reservations[j]
                res2 = reservations[i]

            # we subtract a little bit because the overlaps method isn't really meant to do this
            # it will report warnings when one reservation starts at the same time another ends
            if res1.overlaps(res2.start, res2.duration - 0.00001):
                # now we need to check for overlap in space
                results = ComponentProxy(self.COMP_SYSTEM).get_partitions(
                    [{'name': p, 'children': '*', 'parents': '*'} for p in res2.partitions.split(":")])
                for p in res1.partitions.split(":"):
                    for r in results:
                        if p == r['name'] or p in r['children'] or p in r['parents']:
                            ret += "Warning: reservation '%s' overlaps reservation '%s'\n" % (res1.name, res2.name)
    return ret
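
# Why the 0.00001 shave above matters, as a standalone worked example.  Per
# the comment, overlaps() flags a reservation that starts at the exact
# instant another's window ends, so the window endpoint must be pulled in.
# With res2's window = [100, 150] and res1 starting at 150:
window_start, window_end = 100, 100 + 50      # res2's full window
res1_start = 150
assert res1_start <= window_end               # spurious "overlap" at the boundary
window_end = 100 + (50 - 0.00001)             # shaved, as in the code above
assert not res1_start <= window_end           # abutting reservations now pass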
def q_add(self, *args, **kwargs):
    '''Add a reservation to tracking.

    Side Effects:
        - Add a queue to be tracked
        - If no cqm associated queue, create a reservation queue
        - Set policies for new queue
        - Emit numerous creation messages

    '''
    qm = ComponentProxy("queue-manager")
    try:
        queues = [spec['name'] for spec in qm.get_queues([{'name': "*"}])]
    except ComponentLookupError:
        logger.error("unable to contact queue manager when adding reservation")
        raise

    try:
        specs = args[0]
        for spec in specs:
            if "res_id" not in spec or spec['res_id'] == '*':
                spec['res_id'] = bgsched_id_gen.get()
        reservations = Cobalt.Data.DataDict.q_add(self, *args, **kwargs)
    except KeyError, err:
        raise ReservationError("Error: a reservation named %s already exists" % err)
    return reservations
def add_process_groups(self, specs):
    """Create a process group.

    Arguments:
    spec -- dictionary hash specifying a process group to start
    """
    self.logger.info("add_process_groups(%r)" % (specs))

    script_specs = []
    other_specs = []
    for spec in specs:
        if spec.get('mode', False) == "script":
            script_specs.append(spec)
        else:
            other_specs.append(spec)

    # start up script jobs
    script_pgroups = []
    if script_specs:
        for spec in script_specs:
            try:
                self._set_kernel(spec.get('location')[0], spec.get('kernel', "default"))
            except Exception, e:
                new_pgroup = self.process_groups.q_add([spec])
                pgroup = new_pgroup[0]
                pgroup.nodect = self._partitions[pgroup.location[0]].size
                pgroup.exit_status = 1
                self.logger.info("process group %s: job %s/%s failed to set the kernel; %s",
                    pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                try:
                    script_pgroup = ComponentProxy("script-manager").add_jobs([spec])
                except (ComponentLookupError, xmlrpclib.Fault):
                    self._clear_kernel(spec.get('location')[0])
                    # FIXME: jobs that were already started are not reported
                    raise ProcessGroupCreationError("system::add_process_groups failed to communicate with script-manager")
                new_pgroup = self.process_groups.q_add([spec])
                pgroup = new_pgroup[0]
                pgroup.script_id = script_pgroup[0]['id']
                pgroup.nodect = self._partitions[pgroup.location[0]].size
                self.logger.info("job %s/%s: process group %s created to track script",
                    pgroup.jobid, pgroup.user, pgroup.id)
                self.reserve_resources_until(spec['location'],
                    time.time() + 60 * float(spec['walltime']), pgroup.jobid)
                if pgroup.kernel != "default":
                    self.logger.info("process group %s: job %s/%s using kernel %s",
                        pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                script_pgroups.append(pgroup)
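
# For reference, a sketch of the spec dictionary add_process_groups consumes,
# pieced together from the fields read above (mode, location, kernel,
# walltime, jobid, user) and those passed by invoke_mpi_from_script further
# down; the concrete values are hypothetical.
example_spec = {
    'mode': 'script',                        # routes the spec to the script-manager path
    'location': ['HYPOTHETICAL-PARTITION'],  # location[0] selects the kernel target
    'kernel': 'default',
    'walltime': '30',                        # minutes; reserve_resources_until uses 60 * float(walltime)
    'jobid': 12345,
    'user': 'someuser',
}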
def start(self):
    """Start the process group by forking to _mpirun()"""
    try:
        data = self.prefork()
        self.head_pid = ComponentProxy("forker").fork(data)
    except:
        self.logger.error("problem forking: pg %s did not find a child pid", self.id)
def __init__(self, *args, **kwargs):
    BGSched.__init__(self, *args, **kwargs)
    self.get_current_time = ComponentProxy("event-manager").get_current_time
    self.COMP_QUEUE_MANAGER = "cluster-queue-manager"
    self.COMP_SYSTEM = "cluster-system"
    self.queues = Cobalt.Components.bgsched.QueueDict(self.COMP_QUEUE_MANAGER)
    self.jobs = Cobalt.Components.bgsched.JobDict(self.COMP_QUEUE_MANAGER)
    self.running_job_walltime_prediction = False
def unregister_with_slp(self):
    try:
        name = self.instance.name
    except AttributeError:
        return
    try:
        ComponentProxy("service-location").unregister(name)
    except Exception, e:
        self.logger.error("unregister_with_slp() [%s]" % (e))
class QueueDict(ForeignDataDict):
    """Dictionary for the queue metadata cache."""
    item_cls = Queue
    key = 'name'
    __oserror__ = Cobalt.Util.FailureMode("QM Connection (queue)")
    __function__ = ComponentProxy("queue-manager").get_queues
    __fields__ = ['name', 'state', 'policy', 'priority']
def _run_reservation_jobs(self, reservations_cache):
    # handle each reservation separately, as they shouldn't be competing for resources
    for cur_res in reservations_cache.itervalues():
        #print "trying to run res jobs in", cur_res.name, self.started_jobs
        queue = cur_res.queue
        if not (self.queues.has_key(queue) and self.queues[queue].state == 'running'):
            continue

        temp_jobs = self.jobs.q_get([{'is_runnable': True, 'queue': queue}])
        active_jobs = []
        for j in temp_jobs:
            if not self.started_jobs.has_key(j.jobid) and cur_res.job_within_reservation(j):
                active_jobs.append(j)

        if not active_jobs:
            continue
        active_jobs.sort(self.utilitycmp)

        job_location_args = []
        for job in active_jobs:
            job_location_args.append({
                'jobid': str(job.jobid),
                'nodes': job.nodes,
                'queue': job.queue,
                'required': cur_res.partitions.split(":"),
                'utility_score': job.score,
                'walltime': job.walltime,
                'attrs': job.attrs,
                'user': job.user,
            })

        # there's no backfilling in reservations
        try:
            best_partition_dict = ComponentProxy(self.COMP_SYSTEM).find_job_location(job_location_args, [])
        except:
            self.logger.error("failed to connect to system component")
            best_partition_dict = {}

        for jobid in best_partition_dict:
            job = self.jobs[int(jobid)]
            self._start_job(job, best_partition_dict[jobid], {str(job.jobid): cur_res.res_id})
def add_jobs(self, specs):
    """Add a job to the process manager."""
    self.logger.info("add_jobs(%r)" % (specs))
    jobs = self.jobs.q_add(specs)
    system_specs = \
        ComponentProxy("system").add_jobs([job.to_rx() for job in jobs])
    for system_spec in system_specs:
        job = self.jobs[system_spec['id']]
        job.state = "running"
    return jobs
def register_with_slp(self):
    try:
        name = self.instance.name
    except AttributeError:
        self.logger.error("register_with_slp() [unknown component]")
        return
    try:
        ComponentProxy("service-location").register(name, self.url)
    except Exception, e:
        self.logger.error("register_with_slp() [%s]" % (e))
def signal_process_groups(self, specs, signame="SIGINT"):
    my_process_groups = self.process_groups.q_get(specs)
    for pg in my_process_groups:
        if pg.exit_status is None:
            try:
                ComponentProxy("forker").signal(pg.head_pid, signame)
            except:
                self.logger.error("Failed to communicate with forker when signalling job")
    return my_process_groups
def __init__(self, *args, **kwargs):
    BGSched.__init__(self, *args, **kwargs)
    self.get_current_time = ComponentProxy("event-manager").get_current_time
    predict_scheme = kwargs.get("predict", False)
    if predict_scheme:
        # the third digit of the prediction scheme toggles walltime
        # prediction for running jobs
        self.running_job_walltime_prediction = bool(int(predict_scheme[2]))
    else:
        self.running_job_walltime_prediction = False
def start(self):
    """Start the process group by contacting the appropriate forker component"""
    try:
        data = self.prefork()
        self.head_pid = ComponentProxy(self.forker, retry=False).fork(
            [self.executable] + self.args, self.tag,
            "Job %s/%s/%s" % (self.jobid, self.user, self.id),
            self.env, data, self.runid)
    except:
        _logger.error("Job %s/%s/%s: problem forking; %s did not return a child id",
            self.jobid, self.user, self.id, self.forker)
        raise
def get_mate_jobs_status_local(self, remote_jobid):
    '''return mate job status, invoked by local functions'''
    status_dict = {}
    try:
        status_dict = ComponentProxy(REMOTE_QUEUE_MANAGER).get_mate_job_status(remote_jobid)
    except:
        self.logger.error("failed to connect to remote queue-manager component!")
        status_dict = {'status': 'notconnected'}
        self.dbglog.LogMessage("failed to connect to remote queue-manager component!")
    return status_dict
def q_del(self, *args, **kwargs):
    reservations = Cobalt.Data.DataDict.q_del(self, *args, **kwargs)
    qm = ComponentProxy('queue-manager')
    queues = [spec['name'] for spec in qm.get_queues([{'name': "*"}])]
    spec = [{'name': reservation.queue} for reservation in reservations \
        if reservation.createdQueue and reservation.queue in queues and \
        not self.q_get([{'queue': reservation.queue}])]
    try:
        qm.set_queues(spec, {'state': "dead"}, "bgsched")
    except Exception, e:
        logger.error("problem disabling reservation queue (%s)" % e)
    return reservations
def signal(self, signame="SIGINT"):
    """Do something with this process group depending on the signal"""
    logstr = "ProcessGroup:signal:"
    LOGGER.debug(logstr + "%s:%s" % (self.jobid, signame))
    try:
        if self.local_id:
            ComponentProxy("forker").signal(self.local_id, signame)
    except OSError as ose:
        LOGGER.exception(logstr + "failure for PG %s: %s" % (self.id, ose))
def _start_job(self, job, partition_list):
    cqm = ComponentProxy(self.COMP_QUEUE_MANAGER)
    try:
        self.logger.info("trying to start job %d on partition %r" % (job.jobid, partition_list))
        cqm.run_jobs([{'tag': "job", 'jobid': job.jobid}], partition_list)
    except ComponentLookupError:
        self.logger.error("failed to connect to queue manager")
        return
    self.started_jobs[job.jobid] = self.get_current_time()
def check_jobs(self):
    """Finish jobs that are no longer running on the system."""
    self.logger.info("check_jobs()")
    local_job_specs = [job.to_rx(["id"]) for job in self.jobs.values() if job.state != 'finished']
    try:
        system_job_specs = ComponentProxy("system").get_jobs(local_job_specs)
    except ComponentLookupError:
        self.logger.error("check_jobs() [unable to contact system]")
        return
    system_job_ids = [spec['id'] for spec in system_job_specs]
    for job in self.jobs.values():
        if job.id not in system_job_ids and job.state != "finished":
            job.state = "finished"
def q_add(self, *args, **kwargs):
    qm = ComponentProxy(self.COMP_QUEUE_MANAGER)
    try:
        queues = [spec['name'] for spec in qm.get_queues([{'name': "*"}])]
    except ComponentLookupError:
        logger.error("unable to contact queue manager when adding reservation")
        raise
    try:
        reservations = Cobalt.Data.DataDict.q_add(self, *args, **kwargs)
    except KeyError, e:
        raise ReservationError("Error: a reservation named %s already exists" % e)
    return reservations
class JobDict(ForeignDataDict):
    """Dictionary of job metadata from cqm for job location purposes."""
    item_cls = Job
    key = 'jobid'
    __oserror__ = Cobalt.Util.FailureMode("QM Connection (job)")
    __function__ = ComponentProxy("queue-manager").get_jobs
    __fields__ = ['nodes', 'location', 'jobid', 'state', 'index', 'walltime',
                  'queue', 'user', 'submittime', 'starttime', 'project',
                  'is_runnable', 'is_active', 'has_resources', 'score',
                  'attrs', 'walltime_p', 'geometry']
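
# Both JobDict and QueueDict above are refreshed from the queue-manager via
# their __function__ hook.  A minimal refresh sketch, assuming the
# ForeignDataDict.Sync() interface and the no-argument constructor; how
# Sync() surfaces connection failures is an assumption here.
jobs = JobDict()
queues = QueueDict()
for cache in (jobs, queues):
    try:
        cache.Sync()
    except (ComponentLookupError, xmlrpclib.Fault):
        logger.error("failed to sync %s cache", cache.__class__.__name__)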
def update(self, spec):
    if spec.has_key("users"):
        qm = ComponentProxy(self.COMP_QUEUE_MANAGER)
        try:
            qm.set_queues([{'name': self.queue}], {'users': spec['users']}, "bgsched")
        except ComponentLookupError:
            logger.error("unable to contact queue manager when updating reservation users")
            raise
    # try the above first -- if we can't contact the queue-manager, don't update the users
    Data.update(self, spec)
def invoke_mpi_from_script(self, spec):
    '''Run an mpirun job that was invoked by a script.'''
    self.state = 'running'
    stdin = spec.get("stdin", self.stdin)
    stdout = spec.get("stdout", self.stdout)
    stderr = spec.get("stderr", self.stderr)
    try:
        pgroup = ComponentProxy("system").add_process_groups([{
            'jobid': self.jobid,
            'tag': 'process-group',
            'user': self.user,
            'stdout': stdout,
            'stderr': stderr,
            'cobalt_log_file': self.cobalt_log_file,
            'cwd': self.cwd,
            'location': self.location,
            'stdin': stdin,
            'true_mpi_args': spec['true_mpi_args'],
            'env': {'path': self.path},
            'size': 0,
            'args': [],
            'executable': "this will be ignored",
        }])
    except (ComponentLookupError, xmlrpclib.Fault):
        self.log.error("Job %s: Failed to start up user script job" % (self.jobid))
        return
    if not pgroup[0].has_key('id'):
        self.log.error("Process Group creation failed for Job %s" % self.jobid)
        self.set('state', 'sm-failure')
    else:
        self.mpi_system_id = pgroup[0]['id']
def launch_script(self, config_option, host, jobid, user, group_name):
    '''Start our script processes used for node prep and cleanup.'''
    script = get_cluster_system_config(config_option, None)
    if script is None:
        self.logger.error("Job %s/%s: %s not defined in the cluster_system "
            "section of the cobalt config file!", jobid, user, config_option)
        return None
    cmd = ["/usr/bin/ssh", host, script, str(jobid), user, group_name]
    return ComponentProxy("system_script_forker").fork(cmd, "system epilogue",
        "Job %s/%s" % (jobid, user))
def start(self):
    """Starts the process group by:
    1. Precompiling the data set for the job
    2. Calling the forker with the job data
    3. Saving the local_id from the forker

    ### Still not sure about this, future work here...
    """
    # FIXME: no error handling around the forker call yet
    data = self.prefork()
    local_id = ComponentProxy("forker").fork(data)
    print "****************************************************"
    print "     Local ID is %s" % local_id
    print "****************************************************"
    self.local_id = local_id