class Simulator (ClusterBaseSystem): """Generic system simulator. Methods: configure -- load partitions from an xml file reserve_partition -- lock a partition for use by a process_group (exposed) release_partition -- release a locked (busy) partition (exposed) add_process_groups -- add (start) a process group on the system (exposed, query) get_process_groups -- retrieve process groups (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- simulates updating partition state from the bridge API (automatic) """ name = "system" implementation = "cluster_simulator" logger = logger def __init__ (self, *args, **kwargs): ClusterBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = ClusterProcessGroup def __setstate__(self, state): ClusterBaseSystem.__setstate__(self, state) self.process_groups.item_cls = ClusterProcessGroup def add_process_groups (self, specs): """Create a simulated process group. Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) process_groups = self.process_groups.q_add(specs) for process_group in process_groups: self.start(process_group) return process_groups add_process_groups = exposed(query(all_fields=True)(add_process_groups)) def get_process_groups (self, specs): """Query process_groups from the simulator.""" return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def wait_process_groups (self, specs): """get process groups that have finished running.""" self.logger.info("wait_process_groups(%r)" % (specs)) process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None] for process_group in process_groups: self.logger.info("finished on hosts: %s", Cobalt.Util.merge_nodelist(self.process_groups[process_group.id].location)) for host in self.process_groups[process_group.id].location: self.running_nodes.discard(host) del self.process_groups[process_group.id] return process_groups wait_process_groups = exposed(query(wait_process_groups)) def signal_process_groups (self, specs, signame="SIGINT"): """Simulate the signaling of a process_group.""" self.logger.info("signal_process_groups(%r, %r)" % (specs, signame)) process_groups = self.process_groups.q_get(specs) for process_group in process_groups: process_group.signals.append(signame) return process_groups signal_process_groups = exposed(query(signal_process_groups)) def start (self, process_group): thread.start_new_thread(self._mpirun, (process_group, )) def _mpirun (self, process_group): argv = process_group._get_argv() stdout = open(process_group.stdout or "/dev/null", "a") stderr = open(process_group.stderr or "/dev/null", "a") try: cobalt_log_file = open(process_group.cobalt_log_file or "/dev/null", "a") print >> cobalt_log_file, "%s\n" % " ".join(argv[1:]) cobalt_log_file.close() except: logger.error("Job %s/%s: unable to open cobaltlog file %s" % (process_group.id, process_group.user, process_group.cobalt_log_file)) try: partition = argv[argv.index("-partition") + 1] except ValueError: print >> stderr, "ERROR: '-partition' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-partition' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" 
process_group.exit_status = 1 return try: mode = argv[argv.index("-mode") + 1] except ValueError: print >> stderr, "ERROR: '-mode' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-mode' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = argv[argv.index("-np") + 1] except ValueError: print >> stderr, "ERROR: '-np' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-np' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = int(size) except ValueError: print >> stderr, "ERROR: '-np' got invalid value %r" % (size) print >> stderr, "FE_MPI (Info) : Exit status: 1" print >> stdout, "ENVIRONMENT" print >> stdout, "-----------" for key, value in process_group.env.iteritems(): print >> stdout, "%s=%s" % (key, value) print >> stdout print >> stderr, "FE_MPI (Info) : Initializing MPIRUN" print >> stderr, "FE_MPI (Info) : process group with id", process_group.id print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate" print >> stdout, "Running process_group: %s" % " ".join(argv) start_time = time.time() run_time = random.randint(60, 180) my_exit_status = 0 print "running for about %f seconds" % run_time while time.time() < (start_time + run_time): if "SIGKILL" in process_group.signals: process_group.exit_status = 1 return elif "SIGTERM" in process_group.signals: print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM" my_exit_status = 1 break else: time.sleep(1) # tumblers better than pumpers print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')" print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated" print >> stderr, "BE_MPI (Info) : Releasing partition", partition print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status process_group.exit_status = my_exit_status def launch_diags(self, partition, test_name): exit_value = 0 for nc in partition.node_cards: if nc.id in self.failed_components: exit_value = 1 for switch in partition.switches: if switch in self.failed_components: exit_value = 2 self.finish_diags(partition, test_name, exit_value)
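
# ---------------------------------------------------------------------------
# Illustration only (not part of the component above): Simulator._mpirun pulls
# the values that follow "-partition", "-mode" and "-np" out of argv with a
# repeated try/except pattern.  The standalone helper below is a minimal
# sketch of that pattern; the name `_required_flag_value` is hypothetical.
# ---------------------------------------------------------------------------
def _required_flag_value(argv, flag):
    """Return the token following `flag` in argv.

    Raises ValueError if the flag is absent (from argv.index) and IndexError
    if the flag is the last token -- the same two error branches that
    _mpirun handles above by printing an error and setting exit_status = 1.
    """
    return argv[argv.index(flag) + 1]

# Example (hypothetical argv):
#   _required_flag_value(["mpirun", "-partition", "ANL_R00", "-np", "32"], "-np")
#   -> "32"
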
class Simulator(BGBaseSystem): """Generic system simulator. Methods: configure -- load partitions from an xml file reserve_partition -- lock a partition for use by a process_group (exposed) release_partition -- release a locked (busy) partition (exposed) add_process_groups -- add (start) a process group on the system (exposed, query) get_process_groups -- retrieve process groups (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- simulates updating partition state from the bridge API (automatic) """ name = "system" implementation = "simulator" logger = logger MIN_RUN_TIME = 60 MAX_RUN_TIME = 180 def __init__(self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = BGSimProcessGroup self.config_file = kwargs.get("config_file", None) self.failed_components = sets.Set() if self.config_file is not None: self.configure(self.config_file) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue flags[part.name] = (sched, func, queue) return { 'managed_partitions': self._managed_partitions, 'version': 2, 'config_file': self.config_file, 'partition_flags': flags } def __setstate__(self, state): self._managed_partitions = state['managed_partitions'] self.config_file = state['config_file'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGSimProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.failed_components = sets.Set() self.pending_diags = dict() self.failed_diags = list() self.bridge_in_error = False self.cached_partitions = None self.offline_partitions = [] if self.config_file is not None: self.configure(self.config_file) if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) def configure(self, config_file): """Configure simulated partitions. 
Arguments: config_file -- xml configuration file """ def _get_node_card(name): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name) return self.node_card_cache[name] self.logger.info("configure()") try: system_doc = ElementTree.parse(config_file) except IOError: self.logger.error("unable to open file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) except: self.logger.error("problem loading data from file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) system_def = system_doc.getroot() if system_def.tag != "BG": self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag)) self.logger.error("exiting...") sys.exit(1) # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] # this is going to hold partition objects from the bridge (not our own Partition) wiring_cache = {} bp_cache = {} for partition_def in system_def.getiterator("Partition"): if not partition_def.get("name").startswith("ANL"): continue node_list = [] switch_list = [] for nc in partition_def.getiterator("NodeCard"): node_list.append(_get_node_card(nc.get("id"))) nc_count = len(node_list) # remove partitions which have less than 512 nodes if (NODES_PER_NODECARD * nc_count) < 512: continue if not wiring_cache.has_key(nc_count): wiring_cache[nc_count] = [] wiring_cache[nc_count].append(partition_def.get("name")) for s in partition_def.getiterator("Switch"): switch_list.append(s.get("id")) tmp_list.append( dict( name=partition_def.get("name"), queue=partition_def.get("queue", "default"), size=NODES_PER_NODECARD * nc_count, node_cards=node_list, switches=switch_list, state="idle", )) partitions.q_add(tmp_list) # find the wiring deps for size in wiring_cache: for p in wiring_cache[size]: p = partitions[p] s1 = sets.Set(p.switches) for other in wiring_cache[size]: other = partitions[other] if (p.name == other.name): continue s2 = sets.Set(other.switches) if s1.intersection(s2): self.logger.info( "found a wiring dep between %s and %s", p.name, other.name) partitions[p.name]._wiring_conflicts.add(other.name) # update object state self._partitions.clear() self._partitions.update(partitions) print "Total partitions: ", len(self._partitions) def reserve_partition(self, name, size=None): """Reserve a partition and block all related partitions. 
Arguments: name -- name of the partition to reserve size -- size of the process group reserving the partition (optional) """ try: partition = self.partitions[name] except KeyError: self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size)) return False if partition.state != "allocated": self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state)) return False if not partition.functional: self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size)) if size is not None and size > partition.size: self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size)) return False self._partitions_lock.acquire() try: partition.state = "busy" partition.reserved_until = False except: self.logger.error("error in reserve_partition", exc_info=True) self._partitions_lock.release() # explicitly call this, since the above "busy" is instantaneously available self.update_partition_state() self.logger.info("reserve_partition(%r, %r)" % (name, size)) return True reserve_partition = exposed(reserve_partition) def release_partition(self, name): """Release a reserved partition. Arguments: name -- name of the partition to release """ try: partition = self.partitions[name] except KeyError: self.logger.error("release_partition(%r) [already free]" % (name)) return False if not partition.state == "busy": self.logger.info("release_partition(%r) [not busy]" % (name)) return False self._partitions_lock.acquire() try: partition.state = "idle" except: self.logger.error("error in release_partition", exc_info=True) self._partitions_lock.release() # explicitly unblock the blocked partitions self.update_partition_state() self.logger.info("release_partition(%r)" % (name)) return True release_partition = exposed(release_partition) def add_process_groups(self, specs): """Create a simulated process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) script_specs = [] other_specs = [] for spec in specs: if spec.get('mode') == "script": script_specs.append(spec) else: other_specs.append(spec) # start up script jobs new_pgroups = [] if script_specs: try: for spec in script_specs: script_pgroup = ComponentProxy("script-manager").add_jobs( [spec]) new_pgroup = self.process_groups.q_add([spec]) new_pgroup[0].script_id = script_pgroup[0]['id'] self.reserve_resources_until( spec['location'], time.time() + 60 * float(spec['walltime']), new_pgroup[0].jobid) new_pgroups.append(new_pgroup[0]) except (ComponentLookupError, xmlrpclib.Fault): raise ProcessGroupCreationError( "system::add_process_groups failed to communicate with script-manager" ) process_groups = self.process_groups.q_add(other_specs) for process_group in process_groups: self.start(process_group) return new_pgroups + process_groups add_process_groups = exposed(query(all_fields=True)(add_process_groups)) def get_process_groups(self, specs): """Query process_groups from the simulator.""" return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def wait_process_groups(self, specs): """get process groups that have finished running.""" self.logger.info("wait_process_groups(%r)" % (specs)) process_groups = [ pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None ] for process_group in process_groups: # jobs that were launched on behalf of the script manager shouldn't release the partition if not process_group.true_mpi_args: self.reserve_resources_until(process_group.location, None, process_group.jobid) del self.process_groups[process_group.id] return process_groups wait_process_groups = exposed(query(wait_process_groups)) def signal_process_groups(self, specs, signame="SIGINT"): """Simulate the signaling of a process_group.""" self.logger.info("signal_process_groups(%r, %r)" % (specs, signame)) process_groups = self.process_groups.q_get(specs) for process_group in process_groups: if process_group.mode == "script": try: pgroup = ComponentProxy("script-manager").signal_jobs( [{ 'id': process_group.script_id }], "SIGTERM") except (ComponentLookupError, xmlrpclib.Fault): logger.error( "Failed to communicate with script manager when killing job" ) else: process_group.signals.append(signame) return process_groups signal_process_groups = exposed(query(signal_process_groups)) def start(self, process_group): thread.start_new_thread(self._mpirun, (process_group, )) def _mpirun(self, process_group): argv = process_group._get_argv() try: stdout = open(process_group.stdout or "/dev/null", "a") except: stdout = open("/dev/null", "a") try: stderr = open(process_group.stderr or "/dev/null", "a") except: stderr = open("/dev/null", "a") try: clfn = process_group.cobalt_log_file or "/dev/null" cobalt_log_file = open(clfn, "a") print >> cobalt_log_file, "%s\n" % " ".join(argv[1:]) cobalt_log_file.close() except: logger.error("Job %s/%s: unable to open cobaltlog file %s", process_group.id, process_group.user, clfn, exc_info=True) try: partition = argv[argv.index("-partition") + 1] except ValueError: print >> stderr, "ERROR: '-partition' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-partition' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: mode = 
argv[argv.index("-mode") + 1] except ValueError: print >> stderr, "ERROR: '-mode' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-mode' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = argv[argv.index("-np") + 1] except ValueError: print >> stderr, "ERROR: '-np' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-np' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = int(size) except ValueError: print >> stderr, "ERROR: '-np' got invalid value %r" % (size) print >> stderr, "FE_MPI (Info) : Exit status: 1" print >> stdout, "ENVIRONMENT" print >> stdout, "-----------" for key, value in process_group.env.iteritems(): print >> stdout, "%s=%s" % (key, value) print >> stdout print >> stderr, "FE_MPI (Info) : Initializing MPIRUN" reserved = self.reserve_partition(partition, size) if not reserved: print >> stderr, "BE_MPI (ERROR): Failed to run process on partition" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (ERROR): Failure list:" print >> stderr, "FE_MPI (ERROR): - 1. ProcessGroup execution failed - unable to reserve partition", partition print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return hardware_failure = False for nc in self.partitions[partition].node_cards: if nc.id in self.failed_components: hardware_failure = True break for switch in self.partitions[partition].switches: if switch in self.failed_components: hardware_failure = True break if hardware_failure: excuses = [ "incorrectly polarized packet accelerator", "the Internet is full", "side fumbling detected", "unilateral phase detractors offline", ] print >> stderr, "BE_MPI (ERROR): Booting aborted - partition is in DEALLOCATING ('D') state" print >> stderr, "BE_MPI (ERROR): Partition has not reached the READY ('I') state" print >> stderr, "BE_MPI (Info) : Checking for block error text:" print >> stderr, "BE_MPI (ERROR): block error text '%s.'" % random.choice( excuses) print >> stderr, "BE_MPI (Info) : Starting cleanup sequence" time.sleep(20) self.release_partition(partition) print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')" print >> stderr, "FE_MPI (ERROR): Failure list:" print >> stderr, "FE_MPI (ERROR): - 1.", partition, "couldn't boot." 
print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return print >> stderr, "FE_MPI (Info) : process group with id", process_group.id print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate" print >> stdout, "Running process_group: %s" % " ".join(argv) start_time = time.time() run_time = random.randint(self.MIN_RUN_TIME, self.MAX_RUN_TIME) my_exit_status = 0 self.logger.info("process group %d running for about %f seconds", process_group.id, run_time) while time.time() < (start_time + run_time): if "SIGKILL" in process_group.signals: process_group.exit_status = 1 return elif "SIGTERM" in process_group.signals: print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM" my_exit_status = 1 break else: time.sleep(1) # tumblers better than pumpers print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')" print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated" print >> stderr, "BE_MPI (Info) : Releasing partition", partition released = self.release_partition(partition) if not released: print >> stderr, "BE_MPI (ERROR): Partition", partition, "could not switch to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status process_group.exit_status = my_exit_status def update_partition_state(self): # first, set all of the nodecards to not busy for nc in self.node_card_cache.values(): nc.used_by = '' self._partitions_lock.acquire() try: for p in self._partitions.values(): p._update_node_cards() now = time.time() # since we don't have the bridge, a partition which isn't busy # should be set to idle and then blocked states can be derived for p in self._partitions.values(): if p.state != "busy": p.state = "idle" if p.reserved_until and now > p.reserved_until: p.reserved_until = None p.reserved_by = None for p in self._partitions.values(): if p.state == "busy": # when the partition becomes busy, if a script job isn't reserving it, then release the reservation if not p.reserved_by: p.reserved_until = False else: if p.reserved_until: p.state = "allocated" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) for part in p._children: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) for diag_part in self.pending_diags: if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children: p.state = "blocked by pending diags" for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by break for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state in [ "allocated", "busy" ]: p.state = "blocked-wiring (%s)" % dep_name break for part_name in self.failed_diags: part = self._partitions[part_name] if p.name == part.name: p.state = "failed diags" elif p.name in part.parents or p.name in part.children: p.state = "blocked by failed diags" except: self.logger.error("error in update_partition_state", exc_info=True) self._partitions_lock.release() update_partition_state = automatic(update_partition_state) def add_failed_components(self, component_names): success = [] for 
name in component_names: if self.node_card_cache.has_key(name): self.failed_components.add(name) success.append(name) else: for p in self._partitions.values(): if name in p.switches: self.failed_components.add(name) success.append(name) break return success add_failed_component = exposed(add_failed_components) def del_failed_components(self, component_names): success = [] for name in component_names: try: self.failed_components.remove(name) success.append(name) except KeyError: pass return success del_failed_components = exposed(del_failed_components) def list_failed_components(self, component_names): return list(self.failed_components) list_failed_components = exposed(list_failed_components) def launch_diags(self, partition, test_name): exit_value = 0 for nc in partition.node_cards: if nc.id in self.failed_components: exit_value = 1 for switch in partition.switches: if switch in self.failed_components: exit_value = 2 self.finish_diags(partition, test_name, exit_value)
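
# ---------------------------------------------------------------------------
# Illustration only: configure() above records a "wiring dependency" between
# two partitions of the same size whenever their switch lists intersect.
# Below is a minimal standalone sketch of that check; the helper name and the
# input layout are hypothetical, and plain set() replaces the deprecated
# sets.Set used by the component.
# ---------------------------------------------------------------------------
def _find_wiring_conflicts(partitions_by_size):
    """partitions_by_size: {size: [(name, switch_list), ...]}

    Returns a set of frozensets, one per conflicting pair of partitions.
    """
    conflicts = set()
    for size, plist in partitions_by_size.items():
        for name_a, switches_a in plist:
            for name_b, switches_b in plist:
                if name_a == name_b:
                    continue
                # shared switch hardware means the partitions conflict
                if set(switches_a) & set(switches_b):
                    conflicts.add(frozenset((name_a, name_b)))
    return conflicts

# Example:
#   _find_wiring_conflicts({512: [("P1", ["s1", "s2"]), ("P2", ["s2", "s3"])]})
#   -> set([frozenset(["P1", "P2"])])
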
class HeckleSystem(Component): """ Cobalt System component for handling / interacting with Heckle resource manager External Methods: add_process_groups -- allocates nodes get_process_groups -- get process groups based on specs signal_process_groups -- signal a process group wait_process_groups -- removed process groups based on specs Internal Methods: __init__: _start_pg: _check_builds_done: _wait: _release_resources: get_resources: Queue Manager Methods: validate_job: verify_locations: find_job_locations: find_queue_equivalence_classes: """ name = "system" implementation = "HeckleBreadboard" queue_assignments = {} def __init__(self, *args, **kwargs): logger.debug( "heckle: System: init ... %s ... &&&&&&&&&&&&&&&&&&&&&&&&&&&&& I am here as well &&&&&&&&&&&&&&&&&&&&&&&&&" % threading.current_thread().getName()) Component.__init__(self, *args, **kwargs) self.process_groups = ProcessGroupDict() self.process_groups.item_cls = HeckleProcessGroup self.resources = ResourceDict() self.queue_assignments["default"] = self.resources.keys() print "\n\n\n\n" print "Queue assignments are: %s" % self.queue_assignments def __repr__(self): """ printout representation of the class """ indict = self.__dict__ printstr = "" printstr += "Heckle System Object: Values" for element in indict: printstr += str(element) + "::" if indict[element] == None: printstr += "None, " else: printstr += str(indict[element]) + ", " printstr += " Process Groups:" for element in self.process_groups: printstr += str(element) + "::" + str( self.process_groups[element]) + ", " return printstr ##################### # Main set of methods ##################### def add_process_groups(self, specs): """ Allocate nodes and add the list of those allocated to the PGDict specs is a list of dictionaries Each dictionary contains the specifications for all the nodes in the process group """ #Debug - Take out to really rebuild #### Need to check the environment variable for fakebuild try: specs[0]['fakebuild'] = specs[0]['env']['fakebuild'] del specs[0]['env']['fakebuild'] except: pass print "Heckle System: add_process_groups: <<<<<<<<<<<<<<<<<< OK< Debug< This< : %s" % specs HICCUP = HeckleConnector() #try: reservation = HICCUP.make_reservation(**(specs[0])) heckle_res_id = reservation.id uid = specs[0]['user'] logger.debug("Heckle System: heckle_res_id = %i" % heckle_res_id) specs[0]['heckle_res_id'] = heckle_res_id return self.process_groups.q_add( specs, lambda x, _: self._start_pg( x, heckle_res_id=heckle_res_id, uid=uid)) #except Exception as hec_except: ## could do something here about problems ## 1) Kill job, then resubmit job w/o node name(s) ## Would require access to cqadm via api ## 2) Put job / node in fail state ## 3) Simply fail #raise Exception("Heckle System Object: add_process_groups: %s" % hec_except) add_process_groups = exposed(query(add_process_groups)) def get_process_groups(self, specs): """get a list of existing allocations""" #logger.debug( "Heckle System: get_process_groups" ) self._wait() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def signal_process_groups(self, specs, sig): """Free the specified process group (set of allocated nodes)""" logger.debug( "Heckle System: signal_process_groups: Specs are %s, sig is %s" % (specs, sig)) return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig) signal_process_groups = exposed(query(signal_process_groups)) def wait_process_groups(self, specs): """Remove terminated process groups""" logger.debug("Heckle System: 
wait_process_groups; specs are %s" % specs) return self.process_groups.q_del( specs, lambda x, _: self._release_resources(x)) wait_process_groups = exposed(query(wait_process_groups)) ######################################### # Methods for dealing with Process Groups ######################################### def _start_pg(self, pgp, heckle_res_id, uid): """ Populates the process group with its resources gets node information for nodes in process group Updates those attributes Places nodes in the pinging nodes list, to see if they're built """ logger.debug("Heckle System: start_pg: PGP is %s" % pgp) nodelist = pgp.location for node in nodelist: node_attributes = self.resources[node] node_attributes['mac'] = node_attributes['mac'].replace("-", ":") node_attributes['heckle_res_id'] = heckle_res_id pgp.resource_attributes[node] = node_attributes._get_dict() pgp.uid = uid pgp.pinging_nodes.append(nodelist) add_process_groups = exposed(query(add_process_groups)) def _check_builds_done(self): """ Check to see if the nodes are done building Starts the process group if all nodes in them are done building """ #logger.debug( "heckle: System: Check Build Done: Waiting to Start..." ) #sleep(20) retval = True pg_list = [ x for x in self.process_groups.itervalues() if (len(x.pinging_nodes) > 0) ] self.resources.update() for pgp in pg_list: for nodename in pgp.pinging_nodes: teststr = self.resources[nodename]['bootstate'] if teststr == "COMPLETED": logger.debug( "heckle: System: Check Build Done: Removing node %s...%i pinging nodes left" % (nodename, len(pgp.pinging_nodes) - 1)) pgp.pinging_nodes.remove(nodename) elif teststr in ["BOOTING", "", ""]: logger.debug( "Heckle System: Check Build Done: Node %s not done yet." % nodename) elif teststr == "UNALLOCATED": raise Exception( "HIC_SO: _check_builds_done: Node says, 'UNALLOCATED'. Possible build error, or system timed out." ) elif teststr == "CRITFAIL": raise Exception( "HIC_SO: _check_builds_done: Node says, 'CRITFAIL'. It timed out while building." ) ##################### #### Need to figure a better way to fail gracefully on this one... ##################### elif teststr == "READY": raise Exception( "HIC_SO: _check_builds_done: Node says, 'READY'. The Heckle Reservation is already ready already, skipping pinging." ) if len(pgp.pinging_nodes) == 0: logger.debug( "Heckle System: Check Build Done: No Pinging Nodes left, Start PG Running." ) pgp.start() else: retval = False return retval _check_builds_done = automatic(_check_builds_done) def _wait(self): """ Calls the process group container's wait() method """ #logger.debug( "Heckle System: wait" ) for pgp in self.process_groups.itervalues(): pgp.wait() _wait = automatic(_wait) def _release_resources(self, pgp): """ Releases all the Heckle nodes, unreserving them """ logger.debug("Heckle System: Release %s" % pgp.location) #self.resources[pgp.location]['action']='Free' HICCUP = HeckleConnector() HICCUP.free_reserved_node(uid=pgp.uid, node_list=pgp.location) self.resources.free(nodes=pgp.location) def get_resources(self, specs={}): """ Returns a list of names for all the FREE resources (nodes) which match the given specs. 
""" logger.debug("Heckle System: get Resources, specs are %s" % specs) ################################## ### Look at this as a future change ################################## specs['current reservation'] = 9999999 specs['allocatable'] = 'True' res_list = self.resources >= specs logger.debug("Heckle System: get Resources, resources are %s" % res_list) return res_list get_resources = exposed(query(get_resources)) ########################################################## # Methods for interacting with scheduler and queue-manager ########################################################## def validate_job(self, spec): """ Validates a job for submission -- will the job ever run under the current Heckle configuration? Steps: 1) Validate Kernel 2) Validate HW 3) Validate Job versus overall """ logger.debug("Heckle System: Validate Job: Specs are %s" % spec) try: checklist = spec['attrs'] except: checklist = {} #del(checklist['action']) try: nodecount = spec['nodecount'] except: nodecount = 1 glossary = self.resources.glossary dnelist = [] # for attributes which do not exist in glossary badlist = [] # for attributes in glossary which do not exist ################################## ### Look at this as a future change ### Think: Refresh Resources Info ################################## #1st step: Are there enough nodes at all? if nodecount >= self.resources.node_count(): pass else: raise Exception( "Validate Job: Not enough nodes; Requested %s, only have %s in the system." % (nodecount, self.resources.nodecount())) for att in checklist: val = checklist[att] try: if val in glossary[att]: pass else: badlist.append("%s:%s" % (att, val)) # Bad attribute except: dnelist.append(att) #Attribute does not exist checklist['current reservation'] = 9999999 checklist['allocatable'] = 'True' retlist = self.resources >= checklist retcount = len(retlist) goodlen = retcount >= nodecount if goodlen and not badlist and not dnelist: return spec #Good Job! else: retstr = "Validate Job: " if badlist or dnelist: if badlist: restr += "No value for attribute: %s" % badlist if dnelist: retstr += "Attributes Do Not Exist: %s" % dnelist else: retstr += "Need %s nodes, only have %s nodes: %s" % ( nodecount, retcount, retlist) raise Exception(retstr) return spec validate_job = exposed(validate_job) def verify_locations(self, location_list): """ Makes sure a location list is valid location list is a list of fully qualified strings of node names ex: nodename.mcs.anl.gov """ logger.debug("heckle: System: Validate Job: Verify Locations") return location_list in self.resources.glossary verify_locations = exposed(verify_locations) def find_job_location(self, job_location_args, end_times): """ Finds a group of not-busy nodes in which to run the job Arguments: job_location_args -- A list of dictionaries with info about the job jobid -- string identifier nodes -- int number of nodes queue -- string queue name required -- ?? utility_score -- ?? threshold -- ?? walltime -- ?? 
attrs -- dictionary of attributes to match against end_times -- supposed time the job will end Returns: Dictionary with list of nodes a job can run on, keyed by jobid """ logger.debug("heckle: System: find_job_location") locations = {} def jobsort(job): """Used to sort job list by utility score""" return job["utility_score"] job_location_args.sort(key=jobsort) #Try to match jobs to nodes which can run them for job in job_location_args: if "attrs" not in job or job["attrs"] is None: attrs = {} else: attrs = job['attrs'] attrs['current reservation'] = 9999999 attrs['allocatable'] = 'True' nodecount = int(job['nodes']) print "Heckle System: Find Job Location: Job is %s" % job ############################# ### Look at this as point of change ### Think: For node in unreserved nodes ### Choose node from list ### Remove node from unreserved nodes ############################# print "Heckle System: Find Job Location: Free Nodes is %s" % self.resources.getfreenodes( ) nodelist = (self.resources >= attrs) # get Matching Node print "Nodelist at this stage is %s" % nodelist if len(nodelist) >= nodecount: print "Nodecount = %s" % nodecount retlist = nodelist[:nodecount] self.resources.allocate(retlist) print "Heckle System: Find Job Location: Remaining nodelist is %s" % retlist else: raise Exception( "Heckle System: find_job_locations: Not Enough matching Nodes Available" ) locations[job["jobid"]] = retlist print "Locations is now: %s" % locations logger.info("heckle: find_job_location: locations are %s" % locations) return locations find_job_location = exposed(find_job_location) def find_queue_equivalence_classes(self, reservation_dict, active_queue_names): """ Finds equivalent queues An equivalent queue is a queue which can run upon the same partition(s) For now, with one partition (everything!) this is irrelevant. 
Returns: equiv= [{'reservations': [], 'queues': ['default']}] """ logger.debug("Heckle System: find queue equivalence classes") equiv = [] #print "Reservation_Dict is: %s" % reservation_dict #print "Active_queue_names is %s" % active_queue_names #print "Queue assignments are: %s" % self.queue_assignments for queue in self.queue_assignments: # skip queues that aren't running if not queue in active_queue_names: continue found_a_match = False print "Heckle Queue is %s" % queue for equ in equiv: print "Heckle Equ is %s" % equ if equ['data'].intersection(self.queue_assignments[queue]): equ['queues'].add(queue) equ['data'].update(self.queue_assignments[queue]) found_a_match = True break if not found_a_match: equiv.append({ 'queues': set([queue]), 'data': set(self.queue_assignments[queue]), 'reservations': set() }) real_equiv = [] for eq_class in equiv: found_a_match = False for equ in real_equiv: if equ['queues'].intersection(eq_class['queues']): equ['queues'].update(eq_class['queues']) equ['data'].update(eq_class['data']) found_a_match = True break if not found_a_match: real_equiv.append(eq_class) equiv = real_equiv for eq_class in equiv: for res_name in reservation_dict: for host_name in reservation_dict[res_name].split(":"): if host_name in eq_class['data']: eq_class['reservations'].add(res_name) for key in eq_class: eq_class[key] = list(eq_class[key]) del eq_class['data'] return equiv find_queue_equivalence_classes = exposed(find_queue_equivalence_classes) def get_partitions(self, locations): """ Work-around to get the cqadm to run a single job on this system PRE: locations is a list of dict of strings of possible node names POST: if good, return locations if not good, raise exception and list bad nodes """ nodelist = self.resources.Glossary.nodelist logger.debug("Heckle System: get_partitions: raw is are: %s" % locations) logger.debug("Heckle System: get_partitions: vals are: %s" % locs) if locations in nodelist: return locations else: raise Exception( "heckle: System: get_partition: Bad Locations: %s " % badlocations) get_partitions = exposed(get_partitions)
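
# ---------------------------------------------------------------------------
# Illustration only: find_queue_equivalence_classes() above groups queues
# whose node assignments overlap into a single class.  The standalone sketch
# below shows just the first merge pass (the component follows it with a
# second pass that merges classes which later turn out to overlap, and then
# attaches reservations).  The helper name is hypothetical;
# queue_assignments is assumed to map a queue name to a set of node names.
# ---------------------------------------------------------------------------
def _merge_queue_classes(queue_assignments, active_queue_names):
    equiv = []
    for queue, nodes in queue_assignments.items():
        if queue not in active_queue_names:
            continue
        for eq_class in equiv:
            if eq_class['data'] & set(nodes):
                # overlapping nodes: fold this queue into the existing class
                eq_class['queues'].add(queue)
                eq_class['data'].update(nodes)
                break
        else:
            equiv.append({'queues': set([queue]),
                          'data': set(nodes),
                          'reservations': set()})
    return equiv

# Example:
#   _merge_queue_classes({'default': set(['n1', 'n2']), 'debug': set(['n2'])},
#                        ['default', 'debug'])
#   -> a single class containing both queues, since they share node 'n2'
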
class ProcessManager (Component):
    """Generic implementation of process-manager

    Methods:
    add_jobs -- add jobs to the process manager (exposed)
    get_jobs -- query jobs from the process manager (exposed)
    wait_jobs -- return and remove finished jobs (exposed)
    signal_jobs -- send a signal to jobs (exposed)
    check_jobs -- finish jobs that are no longer running on the system (automatic)
    """

    name = "process-manager"
    logger = logger

    def __init__ (self, **kwargs):
        Component.__init__(self, **kwargs)
        self.jobs = JobDict()

    def add_jobs (self, specs):
        """Add a job to the process manager."""
        self.logger.info("add_jobs(%r)" % (specs))
        jobs = self.jobs.q_add(specs)
        system_specs = \
            ComponentProxy("system").add_jobs([job.to_rx() for job in jobs])
        for system_spec in system_specs:
            job = self.jobs[system_spec['id']]
            job.state = "running"
        return jobs
    add_jobs = exposed(query(add_jobs))

    def get_jobs (self, specs):
        """Query jobs from the process manager."""
        self.logger.info("get_jobs(%r)" % (specs))
        return self.jobs.q_get(specs)
    get_jobs = exposed(query(get_jobs))

    def wait_jobs (self, specs):
        """Remove and return jobs that have finished."""
        self.logger.info("wait_jobs(%r)" % (specs))
        specs = [spec.copy() for spec in specs]
        for spec in specs:
            spec['state'] = "finished"
        return self.jobs.q_del(specs)
    wait_jobs = exposed(query(wait_jobs))

    def signal_jobs (self, specs, signame="SIGTERM"):
        """Send a signal to existing job processes."""
        self.logger.info("signal_jobs(%r, %r)" % (specs, signame))
        return ComponentProxy("system").signal_jobs(specs, signame)
    signal_jobs = exposed(signal_jobs)

    def check_jobs (self):
        """Finish jobs that are no longer running on the system."""
        self.logger.info("check_jobs()")
        local_job_specs = [job.to_rx(["id"]) for job in self.jobs.values() if job.state != 'finished']
        try:
            system_job_specs = ComponentProxy("system").get_jobs(local_job_specs)
        except ComponentLookupError:
            self.logger.error("check_jobs() [unable to contact system]")
            return
        system_job_ids = [spec['id'] for spec in system_job_specs]
        for job in self.jobs.values():
            if job.id not in system_job_ids and job.state != "finished":
                job.state = "finished"
    check_jobs = automatic(check_jobs)
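
# ---------------------------------------------------------------------------
# Illustration only: check_jobs() above marks any local job "finished" when
# the system component no longer reports its id.  A minimal standalone sketch
# of that reconciliation step; the helper name and the plain-dict job records
# are hypothetical.
# ---------------------------------------------------------------------------
def _reconcile_job_states(local_jobs, system_job_ids):
    """local_jobs: list of dicts with 'id' and 'state' keys."""
    still_running = set(system_job_ids)
    for job in local_jobs:
        if job['id'] not in still_running and job['state'] != 'finished':
            job['state'] = 'finished'
    return local_jobs

# Example:
#   _reconcile_job_states([{'id': 1, 'state': 'running'},
#                          {'id': 2, 'state': 'running'}], [2])
#   -> job 1 is marked 'finished', job 2 is left alone
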
class ServiceLocator(Component):
    """Generic implementation of the service-location component.

    Methods:
    register -- register a service (exposed)
    unregister -- remove a service from the registry (exposed)
    locate -- retrieve the location of a service (exposed)
    get_services -- part of the query interface from DataSet (exposed)
    """

    name = "service-location"

    # A default logger for the class is placed here.
    # Assigning an instance-level logger is supported,
    # and expected in the case of multiple instances.
    logger = logging.getLogger("Cobalt.Components.ServiceLocator")

    def __init__(self, *args, **kwargs):
        """Initialize a new ServiceLocator.

        All arguments are passed to the component constructor.
        """
        Component.__init__(self, *args, **kwargs)
        self.services = ServiceDict()

    def register(self, service_name, location):
        """Register the availability of a service.

        Arguments:
        service_name -- name of the service to register
        location -- location of the service
        """
        try:
            service = self.services[service_name]
        except KeyError:
            service = Service(dict(name=service_name, location=location))
            self.services[service_name] = service
            self.logger.info("register(%r, %r)" % (service_name, location))
        else:
            service.location = location
            service.touch()
    register = exposed(register)

    def unregister(self, service_name):
        """Remove a service from the registry.

        Arguments:
        service_name -- name of the service to remove
        """
        try:
            del self.services[service_name]
        except KeyError:
            self.logger.info("unregister(%r) [not registered]" % (service_name))
        else:
            self.logger.info("unregister(%r)" % (service_name))
    unregister = exposed(unregister)

    def locate(self, service_name):
        """Retrieve the location for a service.

        Arguments:
        service_name -- name of the service to look up
        """
        try:
            service = self.services[service_name]
        except KeyError:
            self.logger.debug("locate(%r) [not registered]" % (service_name))
            return ""
        return service.location
    locate = exposed(locate)

    def get_services(self, specs):
        """Query interface "Get" method."""
        return self.services.q_get(specs)
    get_services = exposed(query(get_services))
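
# ---------------------------------------------------------------------------
# Illustration only: the ServiceLocator above is a small name -> location
# registry where re-registering an existing name refreshes it in place.  The
# toy class below is a minimal sketch of that flow using a plain dict; the
# class and attribute names are hypothetical.  The real component stores
# Service objects in a ServiceDict and makes these methods callable remotely
# via the `exposed` decorator.
# ---------------------------------------------------------------------------
import time

class _ToyServiceRegistry(object):
    def __init__(self):
        self._services = {}

    def register(self, name, location):
        # new entries are created; existing ones are refreshed in place
        self._services[name] = {'location': location, 'stamp': time.time()}

    def locate(self, name):
        entry = self._services.get(name)
        return entry['location'] if entry else ""

# Example:
#   reg = _ToyServiceRegistry()
#   reg.register("queue-manager", "https://host:port")
#   reg.locate("queue-manager")   -> "https://host:port"
#   reg.locate("unknown")         -> ""
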
                # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag
                # should be added to the process group that wait_process_group uses to determine when a process group is no
                # longer active.  an error message should also be attached to the process group so that cqm can report the
                # problem to the user.
                pgroup.exit_status = 1
                self.logger.info("process group %s: job %s/%s failed to set the kernel; %s",
                                 pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info("process group %s: job %s/%s using kernel %s",
                                     pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                pgroup.start()

        return script_pgroups + process_groups
    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups (self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status (self):
        try:
            running = ComponentProxy("forker").active_list()
        except:
            self.logger.error("failed to contact forker component for list of running jobs")
            return
        for each in self.process_groups.itervalues():
            if each.head_pid not in running and each.exit_status is None and each.mode != "script":
class ClusterSystem(ClusterBaseSystem): """cluster system component. Methods: configure -- load partitions from the bridge API add_process_groups -- add (start) an mpirun process on the system (exposed, ~query) get_process_groups -- retrieve mpirun processes (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- update partition state from the bridge API (runs as a thread) """ name = "system" implementation = "cluster_system" logger = logger def __init__(self, *args, **kwargs): ClusterBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = ClusterProcessGroup def __getstate__(self): state = {} state.update(ClusterBaseSystem.__getstate__(self)) # state.update({ # "cluster_system_version": 1 }) return state def __setstate__(self, state): ClusterBaseSystem.__setstate__(self, state) self.process_groups.item_cls = ClusterProcessGroup def add_process_groups(self, specs): """Create a process group. Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)", specs) process_groups = self.process_groups.q_add(specs) for pgroup in process_groups: self.logger.info( "Job %s/%s: process group %s created to track script", pgroup.user, pgroup.jobid, pgroup.id) #System has started the job. We need remove them from the temp, alloc array #in cluster_base_system. self.apg_started = True for pgroup in process_groups: for location in pgroup.location: try: del self.alloc_only_nodes[location] except KeyError: logger.critical( "%s already removed from alloc_only_nodes list", location) return process_groups add_process_groups = exposed(query(add_process_groups)) def get_process_groups(self, specs): self._get_exit_status() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def _get_exit_status(self): children = {} cleanup = {} for forker in ['user_script_forker']: try: for child in ComponentProxy(forker).get_children( "process group", None): children[(forker, child['id'])] = child child['pg'] = None cleanup[forker] = [] except ComponentLookupError, e: self.logger.error( "failed to contact the %s component to obtain a list of children", forker) except:
class BBSystem(Component): """Breadboard system component. Methods: add_process_groups -- allocates nodes get_process_groups -- get process groups based on specs signal_process_groups -- signal a process group wait_process_groups -- removed process groups based on specs """ name = "system" implementation = "Breadboard" def __init__(self, *args, **kwargs): Component.__init__(self, *args, **kwargs) self.resources = ResourceDict() self.process_groups = ProcessGroupDict() self.process_groups.item_cls = BBProcessGroup self.queue_assignments = {} self.queue_assignments["default"] = sets.Set(self.resources) ##################### # Main set of methods ##################### def add_process_groups(self, specs): """Allocate nodes and add the list of those allocated to the PGDict""" return self.process_groups.q_add(specs, lambda x, _: self._start_pg(x)) add_process_groups = exposed(query(add_process_groups)) def get_process_groups(self, specs): """Get a list of existing allocations""" self._wait() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def signal_process_groups(self, specs, sig): """Free the specified process group (set of allocated nodes)""" return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig) signal_process_groups = exposed(query(signal_process_groups)) def wait_process_groups(self, specs): """Remove terminated process groups""" return self.process_groups.q_del( specs, lambda x, _: self._release_resources(x)) wait_process_groups = exposed(query(wait_process_groups)) ######################################### # Methods for dealing with Process Groups ######################################### def _start_pg(self, pgp): """Starts a process group by initiating building/rebooting nodes""" ########################################### ### The following is for back-compatibility ### with bballoc (bbtools) until breadboard ### is switched entirely to run on cobalt ########################################### bbdata = bblib.BBConfig("/etc/bb.xml") bbdata.SetNodeAttr(pgp.location, { "user": pgp.user, "state": "Cobalt", "comment": "Managed by Cobalt" }) bbdata.WriteAndClose() ########################################### ### End of back-compatibility ########################################### specs = [{"name": name, "attributes": "*"} for name in pgp.location] resources = self.get_resources(specs) action = "build-%s" % pgp.kernel for res in resources: # Set build action for each resource specs = [{"name": res.name}] new_attrs = {"attributes": {"action": action}} self.set_attributes(specs, new_attrs) mac = res.attributes["mac"] linkname = "/tftpboot/pxelinux.cfg/01-%s" \ % mac.replace(":", "-").lower() if os.readlink(linkname) == action: continue os.unlink(linkname) os.symlink(action, linkname) for res in resources: # Cycle power os.system("/usr/sbin/pm -c %s" % res.name) # Add resource to list of building nodes pgp.building_nodes.append(res.name) def _check_builds_done(self): """Checks if nodes are done building for each process group and scripts can begin running""" for pgp in [ x for x in self.process_groups.itervalues() if (len(x.building_nodes) > 0 or len(x.pinging_nodes) > 0) ]: specs = [{ "name": name, "attributes": "*" } for name in pgp.building_nodes] building = self.get_resources(specs) build_action = "build-%s" % pgp.kernel for node in building: if node.attributes["action"] != build_action: pgp.building_nodes.remove(node.name) pgp.pinging_nodes.append(node.name) for nodename in pgp.pinging_nodes: if os.system("/bin/ping -c 1 -W 1 %s 
> /dev/null" % nodename): continue pgp.pinging_nodes.remove(nodename) if len(pgp.building_nodes) == 0 and len(pgp.pinging_nodes) == 0: pgp.start() _check_builds_done = automatic(_check_builds_done) def node_done_building(self, node): """Sets a node as done building Arguments: node -- string name of node that is done building Returns: nothing """ specs = [{"name": node, "attributes": "*"}] nodedata = self.get_resources(specs) if len(nodedata) > 0: buildimage = nodedata[0].attributes["action"] nodedata[0].attributes["action"] = buildimage.replace( "build-", "boot-") node_done_building = exposed(node_done_building) def _wait(self): """Calls the process group container's wait() method""" for pgp in self.process_groups.itervalues(): pgp.wait() _wait = automatic(_wait) def _release_resources(self, pgp): """Releases the resources held by a process group""" os.system("/usr/sbin/pm -0 %s" % " ".join(pgp.location)) specs = [{"name": name} for name in pgp.location] new_attrs = {"state": "idle"} self.set_attributes(specs, new_attrs) ########################################### ### The following is for back-compatibility ### with bballoc (bbtools) until breadboard ### is switched entirely to run on cobalt ########################################### bbdata = bblib.BBConfig("/etc/bb.xml") bbdata.SetNodeAttr(pgp.location, {"user": "******"}) bbdata.WriteAndClose() ########################################### ### End of back-compatibility ########################################### #################################### # Methods for dealing with resources #################################### def add_resources(self, specs): """Add a resource to this system Arguments: specs -- A list of dictionaries with the attributes for the resources Returns: list of values added """ try: ret = self.resources.q_add(specs) for res in ret: self.queue_assignments["default"].add(res) except KeyError: ret = "KeyError" return ret add_resources = exposed(query(add_resources)) def remove_resources(self, specs): """Remove a resource from this system Arguments: specs -- A list of dictionaries with the attributes to pick which resources to remove Returns: list of resources removed """ ret = self.resources.q_del(specs) for res in ret: self.queue_assignments["default"].discard(res) return ret remove_resources = exposed(remove_resources) def get_resources(self, specs): """Returns a list of all the resources for this system matching the given specs (list of dictionaries)""" return self.resources.q_get(specs) get_resources = exposed(query(get_resources)) def set_attributes(self, specs, newattrs): """Sets an attribute in specified resources Arguments: specs -- list of dictionaries with resource attributes to match newattrs -- a dictionary with key:val pairs of attributes to set Returns: a list of the changed resources """ return self.resources.q_get( specs, lambda x, y: [set_attr(x, key, val) for key, val in y.iteritems()], newattrs) set_attributes = exposed(query(set_attributes)) def remove_attributes(self, specs, attrs): """Removes other attributes in specified resources Arguments: specs -- list of dictionaries with resource attributes to match attrs -- list of names of attributes to remove from resource.attributes Returns: a list of the changed resources """ return self.resources.q_get( specs, lambda x, y: [rem_attr(x, key) for key in y], attrs) remove_attributes = exposed(query(remove_attributes)) ########################################################## # Methods for interacting with scheduler and queue-manager 
########################################################## def validate_job(self, spec): """Validate a job for submission Arguments: spec -- job specification dictionary """ max_nodes = len( self.get_resources([{ "name": "*", "functional": True, "scheduled": True }])) try: spec["nodecount"] = int(spec["nodecount"]) except ValueError: raise JobValidationError("Non-integer node count") if not 0 < spec["nodecount"] <= max_nodes: raise JobValidationError("Node count out of realistic range") if float(spec["time"]) < 15: raise JobValidationError("Walltime less than minimum 15 minutes") if "kernel" in spec: if not (os.path.exists( "/tftpboot/pxelinux.cfg/build-%s" % spec["kernel"]) and os.path.exists( "/tftpboot/pxelinux.cfg/boot-%s" % spec["kernel"])): raise JobValidationError( ("Specified image %s (from -k " + "'kernel' flag does not exist") % spec["kernel"]) if "attrs" in spec: matched_res = self.resources.get_attr_matched_resources( [{ "name": "*", "functional": True, "scheduled": True, "attributes": "*" }], spec["attrs"]) if spec["nodecount"] > len(matched_res): raise JobValidationError("Not enough nodes exist with the " + "attributes to match") return spec validate_job = exposed(validate_job) def verify_locations(self, location_list): """Makes sure a 'location string' is valid""" resources = self.get_resources([{"name": r} for r in location_list]) return [r.name for r in resources] verify_locations = exposed(verify_locations) def find_job_location(self, job_location_args, end_times): """Finds and reserves a list of nodes in which the job can run Arguments: job_location_args -- A list of dictionaries with info about the job jobid -- string identifier nodes -- int number of nodes queue -- string queue name required -- ?? utility_score -- ?? threshold -- ?? walltime -- ?? 
attrs -- dictionary of attributes to match against end_times -- supposed time the job will end Returns: Dictionary with list of nodes a job can run on, keyed by jobid """ locations = {} def jobsort(job): """Used to sort job list by utility score""" return job["utility_score"] job_location_args.sort(key=jobsort) for job in job_location_args: specs = [{ "name": "*", "functional": True, "scheduled": True, "state": "idle", "attributes": "*" }] if "attrs" not in job or job["attrs"] is None: job["attrs"] = {} resources = self.resources.get_attr_matched_resources( specs, job["attrs"]) if len(resources) < job["nodes"]: #Can't schedule job - not enough resources continue def namesort(res): """Used to sort resources by name""" return res.name resources.sort(key=namesort) used_resources = resources[:job["nodes"]] for res in used_resources: res.state = "busy" locations[job["jobid"]] = [r.name for r in used_resources] return locations find_job_location = exposed(find_job_location) def find_queue_equivalence_classes(self, reservation_dict, active_queue_names): """Finds equivalent queues""" equiv = [] for queue in self.queue_assignments: # skip queues that aren't running if not queue in active_queue_names: continue found_a_match = False for equ in equiv: if equ['data'].intersection(self.queue_assignments[queue]): equ['queues'].add(queue) equ['data'].update(self.queue_assignments[queue]) found_a_match = True break if not found_a_match: equiv.append({ 'queues': set([queue]), 'data': set(self.queue_assignments[queue]), 'reservations': set() }) real_equiv = [] for eq_class in equiv: found_a_match = False for equ in real_equiv: if equ['queues'].intersection(eq_class['queues']): equ['queues'].update(eq_class['queues']) equ['data'].update(eq_class['data']) found_a_match = True break if not found_a_match: real_equiv.append(eq_class) equiv = real_equiv for eq_class in equiv: for res_name in reservation_dict: for host_name in reservation_dict[res_name].split(":"): if host_name in eq_class['data']: eq_class['reservations'].add(res_name) for key in eq_class: eq_class[key] = list(eq_class[key]) del eq_class['data'] return equiv find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
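
# ---------------------------------------------------------------------------
# Illustration only: find_job_location() above sorts the incoming jobs by
# utility_score and then hands each job the first `nodes` matching idle
# resources, skipping jobs that cannot be satisfied.  A minimal standalone
# sketch of that selection using plain names; the helper name is
# hypothetical, and the real component also marks the chosen resources busy.
# ---------------------------------------------------------------------------
def _pick_nodes_for_jobs(jobs, idle_nodes):
    """jobs: list of {'jobid': ..., 'nodes': int, 'utility_score': number}
    idle_nodes: list of node names, consumed as jobs are placed.
    Returns {jobid: [node, ...]} for the jobs that could be placed."""
    locations = {}
    free = list(idle_nodes)
    for job in sorted(jobs, key=lambda j: j['utility_score']):
        if len(free) < job['nodes']:
            continue  # not enough resources for this job; skip it
        locations[job['jobid']] = free[:job['nodes']]
        free = free[job['nodes']:]
    return locations

# Example:
#   _pick_nodes_for_jobs([{'jobid': 'a', 'nodes': 2, 'utility_score': 1}],
#                        ['bb01', 'bb02', 'bb03'])
#   -> {'a': ['bb01', 'bb02']}
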
                    ComponentProxy(forker).cleanup_children(cleanup[forker])
                except ComponentLookupError:
                    self.logger.error("failed to contact the %s component to cleanup children", forker)
                except:
                    self.logger.error("unexpected exception while requesting that the %s component perform cleanup", forker, exc_info=True)
    _get_exit_status = automatic(_get_exit_status, float(get_cluster_system_config('get_exit_status_interval', 10)))

    def wait_process_groups (self, specs):
        self._get_exit_status()
        process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None]
        for process_group in process_groups:
            self.clean_nodes(process_group.location, process_group.user, process_group.jobid)
        return process_groups
    wait_process_groups = locking(exposed(query(wait_process_groups)))

    def signal_process_groups (self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy(pg.forker).signal(pg.head_pid, signame)
                except:
                    self.logger.error("Failed to communicate with forker when signalling job")
        return my_process_groups
    signal_process_groups = exposed(query(signal_process_groups))

    def del_process_groups(self, jobid):
        '''delete a process group and don't track it anymore.

        jobid -- jobid associated with the process group we are removing

        '''
        del_items = self.process_groups.q_del([{'jobid': jobid}])
        if del_items == []:
            self.logger.warning("Job %s: Process group not found for this jobid.", jobid)
        else:
            self.logger.info("Job %s: Process group deleted.", jobid)
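# Illustrative sketch (not part of the component): the spec-dictionary convention that
# q_get/q_add/q_del use throughout these components.  A spec is a dict of attribute
# names to required values, and "*" acts as a wildcard, as in the specs built elsewhere
# in this file.  The real matching lives in Cobalt's data classes; this standalone
# helper only shows the idea, and its name and the example values are made up.
def _sketch_match_specs(items, specs):
    """Return the items (plain dicts) that satisfy at least one spec."""
    def matches(item, spec):
        return all(value == "*" or item.get(key) == value
                   for key, value in spec.items())
    return [item for item in items
            if any(matches(item, spec) for spec in specs)]

# Example (values are illustrative):
#   _sketch_match_specs([{'jobid': 100, 'user': 'alice'},
#                        {'jobid': 101, 'user': 'bob'}],
#                       [{'jobid': 101}])
#   -> [{'jobid': 101, 'user': 'bob'}]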
class HeckleSystem(Component): """ Cobalt System component for handling / interacting with Heckle resource manager External Methods: add_process_groups -- allocates nodes get_process_groups -- get process groups based on specs signal_process_groups -- signal a process group wait_process_groups -- removed process groups based on specs Internal Methods: __init__: _start_pg: _check_builds_done: _wait: _release_resources: get_resources: Queue Manager Methods: validate_job: verify_locations: find_job_locations: find_queue_equivalence_classes: """ name = "system" implementation = "HeckleBreadboard" queue_assignments = {} def __init__(self, *args, **kwargs): Component.__init__(self, *args, **kwargs) self.process_groups = ProcessGroupDict() self.process_groups.item_cls = HeckleProcessGroup self.queue_assignments["default"] = self.get_resources() self.hacky_forbidden_nodes = [ ] #This is a temporary fix for the forbidden nodes issue def __repr__(self): """ printout representation of the class """ indict = self.__dict__ printstr = "" printstr += "Heckle System Object: Values" for element in indict: printstr += str(element) + "::" if indict[element] == None: printstr += "None, " else: printstr += str(indict[element]) + ", " printstr += " Process Groups:" for element in self.process_groups: printstr += str(element) + "::" + \ str(self.process_groups[element]) + ", " return printstr ##################### # Main set of methods ##################### def add_process_groups(self, specs): """ This function takes the specs (a list of jobs) and initiates each job as a process group. The process group abstracts the actual job into an object, providing a single point of control and interaction for all the nodes within that job. Each job is described by a dict. Each dict contains: size: kernel: a String, the name of the kernel image to load. executable: A string, the name of the command to execute upon the head node; this could be considered the actual job's file. stdin, stdout, stderr: Three separate strings, each containing the file to use for standard communication with the job as it is running. May be specified, or False. kerneloptions: A string containing various options for the kernel, or False. args: A list umask: An integer jobid: An integer cobalt_log_file: A string containing the log file to use in the initiation and running of the job itself. location: List of strings of node / resource names env: A dict of key:value strings, specifying the environment in which the job is to run on the node id: A number mode: nodect: cwd: A string, specifying the current working directory in which to run the job on the node walltime: Integer; the time, in minutes, allocated for the job to run on the node. user: A string, the name of the user under which this job is to run. 
""" logstr = "System:add_process_groups:" LOGGER.debug(logstr + "Specs are %s" % specs) return self.process_groups.q_add(specs) add_process_groups = exposed(query(add_process_groups)) def get_process_groups(self, specs): """get a list of existing allocations""" LOGGER.debug("System:get_process_groups: specs are %s" % specs) self._wait() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def signal_process_groups(self, specs, sig): """Free the specified process group (set of allocated nodes)""" LOGGER.debug( "System:signal_process_groups: Specs are %s, sig is %s"\ % (specs, sig) ) return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig) signal_process_groups = exposed(query(signal_process_groups)) def wait_process_groups(self, specs): """Remove terminated process groups""" LOGGER.debug("System:wait_process_groups; specs are %s" % specs) return self.process_groups.q_del(specs, lambda x, \ _:self._release_resources(x)) wait_process_groups = exposed(query(wait_process_groups)) ######################################### # Methods for dealing with Process Groups ######################################### def _check_builds_done(self): """ Check to see if the nodes are done building Starts the process group if all nodes in them are done building """ #LOGGER.debug( "System:Check Build Done: Waiting to Start..." ) #sleep(20) exstr = "System:check_build_done:" retval = True pg_list = [x for x in self.process_groups.itervalues()\ if (len(x.pinging_nodes) > 0)] hiccup = HeckleConnector() for pgp in pg_list: for nodename in pgp.pinging_nodes: teststr = hiccup.get_node_bootstate(nodename) if teststr == "READY": if 'fakebuild' in pgp.__dict__ and pgp.fakebuild: pgp.pinging_nodes.remove(nodename) LOGGER.debug( exstr + "Node %s done building; "\ + "%s pinging nodes left" %\ ( nodename, len(pgp.pinging_nodes)-1 ) ) else: LOGGER.debug( exstr + "Node %s not done yet" %\ nodename ) if teststr == "COMPLETED": LOGGER.debug( exstr + "Removing node %s...%i pinging nodes left" \ % (nodename, len(pgp.pinging_nodes)-1) ) pgp.pinging_nodes.remove(nodename) elif teststr in ["BOOTING", "", ""]: LOGGER.debug(exstr + "Node %s not done yet." % nodename) elif teststr == "UNALLOCATED": raise Exception( exstr + "Node 'UNALLOCATED'; Possible build error, or system timed out." ) elif teststr == "CRITFAIL": raise Exception( exstr + "Node says, 'CRITFAIL'. It timed out while building.") ##################### #### Need to figure a better way to fail gracefully ##################### if len(pgp.pinging_nodes) == 0: LOGGER.debug( "System:Check Build Done: No Pinging Nodes left, Start PG %s Running." \ % pgp.jobid) pgp.start() else: retval = False return retval _check_builds_done = automatic(_check_builds_done) def _wait(self): """ Calls the process group container's wait() method """ waitlen = len(self.process_groups.keys()) LOGGER.debug("System:_wait:%s process groups." % waitlen) for pgp in self.process_groups.itervalues(): pgp.wait() try: del (self.hacky_forbidden_nodes[pgp.location]) except: pass _wait = automatic(_wait) def _release_resources(self, pgp): """ Releases all the Heckle nodes, unreserving them """ LOGGER.debug("System:release") LOGGER.debug("System:Locations are: %s" % pgp.location) hiccup = HeckleConnector() hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location) try: del (self.hacky_forbidden_nodes[pgp.location]) except: pass def get_resources(self, specs=None): """ Returns a list of free resources (nodes) which match the given specs. 
Specs is a dict which describes a job """ LOGGER.debug("System:get Resources") ################################## ### Look at this as a future change ################################## hiccup = HeckleConnector() if not specs: return hiccup.node_list else: return hiccup.list_available_nodes(**specs) get_resources = exposed(query(get_resources)) ########################################################## # Methods for interacting with scheduler and queue-manager ########################################################## def validate_job(self, spec): """ Validates a job for submission -- will the job ever run under the current Heckle configuration? Steps: 1) Validate Kernel 2) Validate HW 3) Validate Job versus overall """ LOGGER.debug("System:Validate Job: Specs are %s" % spec) hiccup = HeckleConnector() try: kernel = spec['kernel'] valid_kernel = hiccup.validkernel(kernel) if not valid_kernel: raise Exception("System:Validate Job: Bad Kernel") except: spec['kernel'] = 'default' try: valid_hw = hiccup.validhw(**spec['attrs']) if not valid_hw: raise Exception("System:Validate Job: Bad Hardware Specs: %s" % spec) except Exception as strec: raise Exception("System:Validate Job: Validate Job: %s" % strec) #try: #valid_job = hiccup.validjob( **spec ) #if not valid_job: #raise Exception( #"System: validate Job: Never enough nodes") #except: #raise Exception("System: validate Job: Never enough nodes") return spec validate_job = exposed(validate_job) def verify_locations(self, location_list): """ Makes sure a location list is valid location list is a list of fully qualified strings of node names ex: nodename.mcs.anl.gov """ LOGGER.debug("System:validate Job: Verify Locations") hiccup = HeckleConnector() heckle_set = set(hiccup.list_all_nodes()) location_set = set(location_list) if heckle_set >= location_set: return location_list else: not_valid_list = list(location_set.difference(heckle_set)) raise Exception( "System:VerifyLocations: Invalid location names: %s" % not_valid_list) verify_locations = exposed(verify_locations) def find_job_location(self, job_location_args, end_times): """ Finds a group of not-busy nodes in which to run the job Arguments: job_location_args -- A list of dictionaries with info about the job jobid -- string identifier nodes -- int number of nodes queue -- string queue name required -- ?? utility_score -- ?? threshold -- ?? walltime -- ?? 
attrs -- dictionary of attributes to match against end_times -- supposed time the job will end Returns: Dictionary with list of nodes a job can run on, keyed by jobid """ LOGGER.debug("System:find_job_location") locations = {} def jobsort(job): """Used to sort job list by utility score""" return job["utility_score"] job_location_args.sort(key=jobsort) #Try to match jobs to nodes which can run them hiccup = HeckleConnector() for job in job_location_args: if "attrs" not in job or job["attrs"] is None: job["attrs"] = {} print "Job is %s" % job tempjob = job.copy() if self.hacky_forbidden_nodes: if 'forbidden' not in tempjob.keys(): tempjob['forbidden'] = self.hacky_forbidden_nodes else: tempjob['forbidden'].extend(self.hacky_forbidden_nodes) ############################# ### Look at this as point of change ### Think: For node in unreserved nodes ### Choose node from list ### Remove node from unreserved nodes ############################# try: resources = hiccup.find_job_location(** job) #get matching nodes if not resources: continue except Exception as err: LOGGER.info("System:find_job_location: Error %s" % err) continue node_list = [] # Build a list of appropriate nodes for node in resources: node_list.append(node) self.hacky_forbidden_nodes.append(node) locations[job["jobid"]] = node_list LOGGER.info("System:find_job_location: locations are %s" % locations) return locations find_job_location = exposed(find_job_location) def find_queue_equivalence_classes(self, reservation_dict, \ active_queue_names): """ Finds equivalent queues An equivalent queue is a queue which can run upon the same partition(s) For now, with one partition (everything!) this is irrelevant. Returns: equiv= [{'reservations': [], 'queues': ['default']}] """ #LOGGER.debug("System:find queue equivalence classes" ) equiv = [] #print "Reservation_Dict is: %s" % reservation_dict #print "Active_queue_names is %s" % active_queue_names #print "Queue assignments are: %s" % self.queue_assignments for queue in self.queue_assignments: # skip queues that aren't running if not queue in active_queue_names: continue found_a_match = False #print "Heckle Queue is %s" % queue for equ in equiv: print "Heckle Equ is %s" % equ if equ['data'].intersection(self.queue_assignments[queue]): equ['queues'].add(queue) equ['data'].update(self.queue_assignments[queue]) found_a_match = True break if not found_a_match: equiv.append({ 'queues': set([queue]), 'data': set(self.queue_assignments[queue]), 'reservations': set() }) real_equiv = [] for eq_class in equiv: found_a_match = False for equ in real_equiv: if equ['queues'].intersection(eq_class['queues']): equ['queues'].update(eq_class['queues']) equ['data'].update(eq_class['data']) found_a_match = True break if not found_a_match: real_equiv.append(eq_class) equiv = real_equiv for eq_class in equiv: for res_name in reservation_dict: for host_name in reservation_dict[res_name].split(":"): if host_name in eq_class['data']: eq_class['reservations'].add(res_name) for key in eq_class: eq_class[key] = list(eq_class[key]) del eq_class['data'] return equiv find_queue_equivalence_classes = exposed(find_queue_equivalence_classes) def get_partitions(self, locations): """ Work-around to get the cqadm to run a single job on this system PRE: locations is a list of dict of strings of possible node names POST: if good, return locations if not good, raise exception and list bad nodes """ logstr = "System:get_partition: " hiccup = HeckleConnector() heckle_node_set = set(hiccup.list_all_nodes()) locs = locations[0]['name'] 
        LOGGER.debug(logstr + "raw locations are: %s" % locations)
        LOGGER.debug(logstr + "vals are: %s" % locs)
        if type(locs) == ListType:
            locset = set(locs)
            badlocations = locset.difference(heckle_node_set)
            if badlocations:
                raise Exception(logstr + "Bad Locations: %s " % list(badlocations))
        elif type(locs) == StringType:
            if locs not in heckle_node_set:
                raise Exception(logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception(logstr +
                "location needs to be a string or list of strings, you provided %s : %s"
                % (type(locs), locs))
        return locations
    get_partitions = exposed(get_partitions)
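# Illustrative sketch (not part of the component): the set-difference check that
# verify_locations and get_partitions above rely on -- every requested node name must
# be in the set of names the resource manager knows about.  The helper name and the
# node names in the example are made up.
def _sketch_verify_locations(known_nodes, requested):
    """Return the requested names if all are known, else raise listing the bad ones."""
    bad = set(requested) - set(known_nodes)
    if bad:
        raise Exception("Invalid location names: %s" % sorted(bad))
    return requested

# Example (values are illustrative):
#   _sketch_verify_locations(['bb01.mcs.anl.gov', 'bb02.mcs.anl.gov'],
#                            ['bb02.mcs.anl.gov'])
#   -> ['bb02.mcs.anl.gov']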
class ClusterSystem(ClusterBaseSystem): """cluster system component. Methods: configure -- load partitions from the bridge API add_process_groups -- add (start) an mpirun process on the system (exposed, ~query) get_process_groups -- retrieve mpirun processes (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- update partition state from the bridge API (runs as a thread) """ name = "system" implementation = "cluster_system" logger = logger def __init__(self, *args, **kwargs): ClusterBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = ClusterProcessGroup def __setstate__(self, state): ClusterBaseSystem.__setstate__(self, state) self.process_groups.item_cls = ClusterProcessGroup def add_process_groups(self, specs): """Create a process group. Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)", specs) process_groups = self.process_groups.q_add(specs) for pgroup in process_groups: self.logger.info( "Job %s/%s: process group %s created to track script", pgroup.user, pgroup.jobid, pgroup.id) #System has started the job. We need remove them from the temp, alloc array #in cluster_base_system. for pg in process_groups: for location in pg.location: del self.alloc_only_nodes[location] return process_groups add_process_groups = exposed(query(add_process_groups)) def get_process_groups(self, specs): self._get_exit_status() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def _get_exit_status(self): try: running = ComponentProxy("forker").active_list("process group") except: self.logger.error( "failed to contact forker component for list of running jobs") return for each in self.process_groups.itervalues(): if each.head_pid not in running and each.exit_status is None: # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just # assume the process is dead? or maybe just say there's no exit code the first time it happens? 
                # maybe the second choice is better
                try:
                    dead_dict = ComponentProxy("forker").get_status(each.head_pid)
                except Queue.Empty:
                    self.logger.error("failed call for get_status from forker component for pg %s", each.head_pid)
                    return
                if dead_dict is None:
                    self.logger.info("Job %s/%s: process group %i: exited with unknown status", each.user, each.jobid, each.id)
                    each.exit_status = 1234567
                else:
                    each.exit_status = dead_dict["exit_status"]
                    if dead_dict["signum"] == 0:
                        self.logger.info("process group %i: job %s/%s exited with status %i", each.id, each.jobid, each.user, each.exit_status)
                    else:
                        if dead_dict["core_dump"]:
                            core_dump_str = ", core dumped"
                        else:
                            core_dump_str = ""
                        self.logger.info("process group %i: job %s/%s terminated with signal %s%s", each.id, each.jobid, each.user, dead_dict["signum"], core_dump_str)
    _get_exit_status = automatic(_get_exit_status)

    def wait_process_groups(self, specs):
        self._get_exit_status()
        process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None]
        for process_group in process_groups:
            #FIXME: This call is a good place to look for problems
            self.clean_nodes(process_group.location, process_group.user, process_group.jobid)
        return process_groups
    wait_process_groups = locking(exposed(query(wait_process_groups)))

    def signal_process_groups(self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy("forker").signal(pg.head_pid, signame)
                except:
                    self.logger.error("Failed to communicate with forker when signalling job")
        return my_process_groups
    signal_process_groups = exposed(query(signal_process_groups))

    def del_process_groups(self, jobid):
        '''delete a process group and don't track it anymore.

        jobid -- jobid associated with the process group we are removing

        '''
        del_items = self.process_groups.q_del([{'jobid': jobid}])
        if del_items == []:
            self.logger.warning("Job %s: Process group not found for this jobid.", jobid)
        else:
            self.logger.info("Job %s: Process group deleted.", jobid)
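# Illustrative sketch (not part of the component): how _get_exit_status above turns
# the status dictionary returned by the forker into an exit status and a log line.
# The dictionary keys mirror the ones used above; the helper itself is hypothetical.
def _sketch_interpret_status(dead_dict):
    """Return (exit_status, description) for a finished process group."""
    if dead_dict is None:
        # the forker no longer knows about the process: flag an unknown exit
        return 1234567, "exited with unknown status"
    if dead_dict["signum"] == 0:
        return dead_dict["exit_status"], "exited with status %s" % dead_dict["exit_status"]
    core = ", core dumped" if dead_dict["core_dump"] else ""
    return dead_dict["exit_status"], "terminated with signal %s%s" % (dead_dict["signum"], core)

# Example (values are illustrative):
#   _sketch_interpret_status({'exit_status': 0, 'signum': 0, 'core_dump': False})
#   -> (0, 'exited with status 0')
#   _sketch_interpret_status({'exit_status': 139, 'signum': 11, 'core_dump': True})
#   -> (139, 'terminated with signal 11, core dumped')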
class ClusterQsim(ClusterBaseSystem): '''Cobalt Queue Simulator for cluster systems''' implementation = "cqsim" name = "cluster-queue-manager" alias = "cluster-system" logger = logging.getLogger(__name__) def __init__(self, *args, **kwargs): ClusterBaseSystem.__init__(self, *args, **kwargs) self.sleep_interval = kwargs.get("sleep_interval", 0) self.fraction = kwargs.get("cluster_fraction", 1) self.sim_start = kwargs.get("c_trace_start", 0) self.sim_end = kwargs.get("c_trace_end", sys.maxint) self.anchor = kwargs.get("anchor", 0) self.workload_file = kwargs.get("cjob") self.output_log = MACHINE_NAME + "-" + kwargs.get("outputlog", "") self.bgjob = kwargs.get("bgjob") self.event_manager = ComponentProxy("event-manager") walltime_prediction = get_histm_config("walltime_prediction", False) # *AdjEst* print "walltime_prediction=", walltime_prediction if walltime_prediction in ["True", "true"]: self.walltime_prediction = True else: self.walltime_prediction = False self.time_stamps = [('I', '0', 0, {})] self.cur_time_index = 0 self.queues = SimQueueDict(policy=None) # self.invisible_job_dict = {} # for jobs not submitted, {jobid:job_instance} self.unsubmitted_job_spec_dict = {} #{jobid: jobspec} self.num_running = 0 self.num_waiting = 0 self.num_busy = 0 self.num_end = 0 self.total_job = 0 self.total_nodes = len(self.all_nodes) self.init_queues() #initialize PBS-style logger self.pbslog = PBSlogger(self.output_log) #initialize debug logger if self.output_log: self.dbglog = PBSlogger(self.output_log+"-debug") else: self.dbglog = PBSlogger(".debug") #finish tag self.finished = False #register local alias "system" for this component local_components["cluster-system"] = self #initialize capacity loss self.capacity_loss = 0 #starting job(id)s at current time stamp. used for calculating capacity loss self.starting_jobs = [] self.user_utility_functions = {} self.builtin_utility_functions = {} self.define_builtin_utility_functions() self.define_user_utility_functions() self.cosched_scheme_tup = kwargs.get("coscheduling", (0,0)) self.cosched_scheme = self.cosched_scheme_tup[1] self.cosched_scheme_remote = self.cosched_scheme_tup[0] self.mate_vicinity = kwargs.get("vicinity", 0) self.mate_ratio = kwargs.get("mate_ratio", 0) valid_cosched_schemes = ["hold", "yield"] if self.cosched_scheme in valid_cosched_schemes and self.cosched_scheme_remote in valid_cosched_schemes: self.coscheduling = True else: self.coscheduling = False if not kwargs.get("bgjob", None): self.coscheduling = False self.mate_job_dict = {} if self.coscheduling: self.jobid_qtime_pairs = self.init_jobid_qtime_pairs() try: self.remote_jobid_qtime_pairs = ComponentProxy(REMOTE_QUEUE_MANAGER).get_jobid_qtime_pairs() except: self.logger.error("fail to connect to remote queue-manager component!") self.coscheduling = False if self.mate_vicinity: print "start init mate job dict, vicinity=", self.mate_vicinity self.init_mate_job_dict_by_vicinity() elif self.mate_ratio: print "start init mate job dict, mate_ratio=", self.mate_ratio self.init_mate_job_dict_by_ratio(self.mate_ratio) else: self.logger.error("fail to initialize mate job dict!") matejobs = len(self.mate_job_dict.keys()) proportion = float(matejobs) / self.total_job #recording holding job id and holden resource self.job_hold_dict = {} #record holding job's holding time jobid:first hold (sec) self.first_hold_time_dict = {} #record yield jobs's first yielding time, for calculating the extra waiting time self.first_yield_hold_time_dict = {} #record yield job ids. 
update dynamically self.yielding_job_list = [] if self.coscheduling: remote_mate_job_dict = dict((v,k) for k, v in self.mate_job_dict.iteritems()) try: ComponentProxy(REMOTE_QUEUE_MANAGER).set_mate_job_dict(remote_mate_job_dict) except: self.logger.error("failed to connect to remote queue-manager component!") self.coscheduling = False print "number of mate job pairs: %s, proportion in cluster jobs: %s%%" \ % (len(self.mate_job_dict.keys()), round(proportion *100, 1) ) self.max_holding_sys_util = DEFAULT_MAX_HOLDING_SYS_UTIL def get_current_time(self): '''this function overrid the get_current_time in bgsched, bg_base_system, and cluster_base_system''' return self.event_manager.get_current_time() def get_current_time_sec(self): return self.event_manager.get_current_time() def get_current_time_date(self): return self.event_manager.get_current_date_time() def insert_time_stamp(self, timestamp, type, info): '''insert time stamps in the same order''' if type not in SET_event: print "invalid event type,", type return evspec = {} evspec['jobid'] = info.get('jobid', 0) evspec['type'] = type evspec['datetime'] = sec_to_date(timestamp) evspec['unixtime'] = timestamp evspec['machine'] = MACHINE_ID self.event_manager.add_event(evspec) def _get_queuing_jobs(self): return [job for job in self.queues.get_jobs([{'is_runnable':True}])] queuing_jobs = property(_get_queuing_jobs) def _get_running_jobs(self): return [job for job in self.queues.get_jobs([{'has_resources':True}])] running_jobs = property(_get_running_jobs) def add_queues(self, specs): '''add queues''' return self.queues.add_queues(specs) add_queues = exposed(query(add_queues)) def get_queues(self, specs): '''get queues''' return self.queues.get_queues(specs) get_queues = exposed(query(get_queues)) def init_queues(self): '''parses the work load log file, initializes queues and sorted time stamp list''' print "Initializing cluster jobs, one moment please..." 
raw_jobs = parse_work_load(self.workload_file) specs = [] tag = 0 for key in raw_jobs: spec = {} tmp = raw_jobs[key] spec['jobid'] = tmp.get('jobid') spec['queue'] = tmp.get('queue') #convert submittime from "%m/%d/%Y %H:%M:%S" to Unix time sec format_sub_time = tmp.get('submittime') if format_sub_time: qtime = date_to_sec(format_sub_time) if qtime < self.sim_start or qtime > self.sim_end: continue spec['submittime'] = qtime #spec['submittime'] = float(tmp.get('qtime')) spec['first_subtime'] = spec['submittime'] #set the first submit time else: continue spec['user'] = tmp.get('user') spec['project'] = tmp.get('account') #convert walltime from 'hh:mm:ss' to float of minutes format_walltime = tmp.get('Resource_List.walltime') spec['walltime'] = 0 if format_walltime: parts = format_walltime.split(',') days = 0 if len(parts) > 1: #contain day: 1 day, 11:00:00 days = int(parts[0].split(' ')[0]) minutes_part = parts[1] else: minutes_part = parts[0] segs = minutes_part.split(':') walltime_minutes = int(segs[0])*60 + int(segs[1]) total_walltime_minutes = walltime_minutes + days * 24 * 60 spec['walltime'] = str(total_walltime_minutes) else: #invalid job entry, discard continue if tmp.get('start') and tmp.get('end'): act_run_time = float(tmp.get('end')) - float(tmp.get('start')) if act_run_time / (float(spec['walltime'])*60) > 1.1: act_run_time = float(spec['walltime'])*60 spec['runtime'] = str(round(act_run_time, 1)) else: continue if tmp.get('Resource_List.nodect'): spec['nodes'] = tmp.get('Resource_List.nodect') if int(spec['nodes']) == 40960: continue else: #invalid job entry, discard continue if self.walltime_prediction: #*AdjEst* ap = self.get_walltime_Ap(spec) spec['walltime_p'] = float(spec['walltime']) * ap else: spec['walltime_p'] = float(spec['walltime']) spec['state'] = 'invisible' spec['start_time'] = '0' spec['end_time'] = '0' spec['queue'] = "default" spec['has_resources'] = False spec['is_runnable'] = False #add the job spec to the spec list specs.append(spec) specs.sort(subtimecmp) #adjust workload density and simulation start time if self.fraction != 1 or self.anchor !=0 : tune_workload(specs, self.fraction, self.anchor) print "workload adjusted: " print "first job submitted:", sec_to_date(specs[0].get('submittime')) print "last job submitted:", sec_to_date(specs[len(specs)-1].get('submittime')) self.total_job = len(specs) print "total job number:", self.total_job #self.add_jobs(specs) self.unsubmitted_job_spec_dict = self.init_unsubmitted_dict(specs) self.event_manager.add_init_events(specs, MACHINE_ID) return 0 def init_unsubmitted_dict(self, specs): #jobdict = {} specdict = {} for spec in specs: jobid = str(spec['jobid']) #new_job = Job(spec) #jobdict[jobid] = new_job specdict[jobid] = spec return specdict def get_walltime_Ap(self, spec): #*AdjEst* '''get walltime adjusting parameter from history manager component''' projectname = spec.get('project') username = spec.get('user') if prediction_scheme == "paired": return self.history_manager.get_Ap_by_keypair(username, projectname) Ap_proj = self.history_manager.get_Ap('project', projectname) Ap_user = self.history_manager.get_Ap('user', username) if prediction_scheme == "project": return Ap_proj elif prediction_scheme == "user": print "Ap_user==========", Ap_user return Ap_user elif prediction_scheme == "combined": return (Ap_proj + Ap_user) / 2 else: return self.history_manager.get_Ap_by_keypair(username, projectname) def log_job_event(self, eventtype, timestamp, spec): '''log job events(Queue,Start,End) to PBS-style log''' def 
len2 (_input): _input = str(_input) if len(_input) == 1: return "0" + _input else: return _input if eventtype == 'Q': #submitted(queued) for the first time message = "%s;Q;%s;queue=%s" % (timestamp, spec['jobid'], spec['queue']) elif eventtype == 'R': #resume running after failure recovery message = "%s;R;%s" % (timestamp, ":".join(spec['location'])) else: wall_time = spec['walltime'] walltime_minutes = len2(int(float(wall_time)) % 60) walltime_hours = len2(int(float(wall_time)) // 60) log_walltime = "%s:%s:00" % (walltime_hours, walltime_minutes) if eventtype == 'S': #start running message = "%s;S;%s;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s start=%s exec_host=%s" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, spec['start_time'], ":".join(spec['location'])) elif eventtype == 'H': #hold some resource message = "%s;H;%s;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s exec_host=%s" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, ":".join(spec['location'])) elif eventtype == "U": #unhold some resources message = "%s;U;%s;host=%s" % \ (timestamp, spec['jobid'], ":".join(spec['location'])) elif eventtype == 'E': #end first_yield_hold = self.first_yield_hold_time_dict.get(int(spec['jobid']), 0) if first_yield_hold > 0: overhead = spec['start_time'] - first_yield_hold else: overhead = 0 message = "%s;E;%s;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s start=%s end=%f exec_host=%s runtime=%s hold=%s overhead=%s" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, spec['start_time'], round(float(spec['end_time']), 1), ":".join(spec['location']), spec['runtime'], spec['hold_time'], overhead) else: print "---invalid event type, type=", eventtype return self.pbslog.LogMessage(message) def get_live_job_by_id(self, jobid): '''get waiting or running job instance by jobid''' job = None joblist = self.queues.get_jobs([{'jobid':int(jobid)}]) if joblist: job = joblist[0] return job def get_jobs(self, specs): '''get a list of jobs, each time triggers time stamp increment and job states update''' jobs = [] if self.event_manager.get_go_next(): del self.yielding_job_list[:] cur_event = self.event_manager.get_current_event_type() if cur_event in ["Q", "E"]: self.update_job_states(specs, {}, cur_event) self.compute_utility_scores() #unhold holding job. MUST be after compute_utility_scores() if cur_event == "U": cur_job = self.event_manager.get_current_event_job() if cur_job in self.job_hold_dict.keys(): self.unhold_job(cur_job) else: #if the job not in job_hold_dict, do nothing. 
the job should have already started return [] if cur_event == "C": if self.job_hold_dict.keys(): self.unhold_all() self.event_manager.set_go_next(True) jobs = self.queues.get_jobs([{'tag':"job"}]) if self.yielding_job_list: jobs = [job for job in jobs if job.jobid not in self.yielding_job_list] return jobs get_jobs = exposed(query(get_jobs)) def update_job_states(self, specs, updates, cur_event): '''update the state of the jobs associated to the current time stamp''' ids_str = str(self.event_manager.get_current_event_job()) ids = ids_str.split(':') #print "current event=", cur_event, " ", ids for Id in ids: if cur_event == "Q": # Job (Id) is submitted tempspec = self.unsubmitted_job_spec_dict[Id] tempspec['state'] = "queued" #invisible -> queued tempspec['is_runnable'] = True #False -> True self.queues.add_jobs([tempspec]) self.num_waiting += 1 self.log_job_event("Q", self.get_current_time_date(), tempspec) del self.unsubmitted_job_spec_dict[Id] elif cur_event=="E": # Job (Id) is completed joblist = self.queues.get_jobs([{'jobid':int(Id)}]) if joblist: completed_job = joblist[0] else: return 0 #log the job end event jobspec = completed_job.to_rx() #print "end jobspec=", jobspec if jobspec['end_time']: end = float(jobspec['end_time']) else: end = 0 end_datetime = sec_to_date(end) self.log_job_event("E", end_datetime, jobspec) #free nodes self.nodes_up(completed_job.location) self.num_busy -= len(completed_job.location) #delete the job instance from self.queues self.queues.del_jobs([{'jobid':int(Id)}]) self.num_running -= 1 self.num_end += 1 return 0 def run_job_updates(self, jobspec, newattr): ''' return the state updates (including state queued -> running, setting the start_time, end_time)''' updates = {} #print "enter run_job_updates, jobspec=", jobspec start = self.get_current_time_sec() updates['start_time'] = start updates['starttime'] = start updates['state'] = 'running' updates['system_state'] = 'running' updates['is_runnable'] = False updates['has_resources'] = True if jobspec['last_hold'] > 0: updates['hold_time'] = jobspec['hold_time'] + self.get_current_time_sec() - jobspec['last_hold'] #print self.get_current_time_date(), "run job state change, job", jobspec['jobid'], \ # ":", jobspec['state'], "->", updates['state'] #determine whether the job is going to fail before completion location = newattr['location'] duration = jobspec['remain_time'] end = start + duration updates['end_time'] = end self.insert_time_stamp(end, "E", {'jobid':jobspec['jobid']}) updates.update(newattr) return updates def add_jobs(self, specs): '''Add a job''' response = self.queues.add_jobs(specs) return response add_jobs = exposed(query(add_jobs)) def run_jobs(self, specs, nodelist): '''run a queued job, by updating the job state, start_time and end_time, invoked by bgsched''' #print "run job ", specs, " on nodes", nodelist if specs == None: return 0 for spec in specs: action = "start" dbgmsg = "" if self.coscheduling: local_job_id = spec.get('jobid') #int #check whether there is a mate job mate_job_id = self.mate_job_dict.get(local_job_id, 0) #if mate job exists, get the status of the mate job if mate_job_id > 0: remote_status = self.get_mate_jobs_status_local(mate_job_id).get('status', "unknown") dbgmsg1 = "local=%s;mate=%s;mate_status=%s" % (local_job_id, mate_job_id, remote_status) self.dbglog.LogMessage(dbgmsg1) if remote_status in ["queuing", "unsubmitted"]: if self.cosched_scheme == "hold": # hold resource if mate cannot run, favoring job action = "start_both_or_hold" if self.cosched_scheme == 
"yield": # give up if mate cannot run, favoring sys utilization action = "start_both_or_yield" if remote_status == "holding": action = "start_both" #self.dbglog.LogMessage(dbgmsg) #to be inserted co-scheduling handling code else: pass if action == "start": #print "CQSIM-normal: start job %s on nodes %s" % (spec['jobid'], nodelist) self.start_job([spec], {'location': nodelist}) elif action == "start_both_or_hold": #print "try to hold job %s on location %s" % (local_job_id, nodelist) mate_job_can_run = False #try to invoke a scheduling iteration to see if remote yielding job can run now try: mate_job_can_run = ComponentProxy(REMOTE_QUEUE_MANAGER).try_to_run_mate_job(mate_job_id) except: self.logger.error("failed to connect to remote queue-manager component!") if mate_job_can_run: #now that mate has been started, start local job self.start_job([spec], {'location': nodelist}) dbgmsg += " ###start both" else: self.hold_job(spec, {'location': nodelist}) elif action == "start_both": #print "start both mated jobs %s and %s" % (local_job_id, mate_job_id) self.start_job([spec], {'location': nodelist}) ComponentProxy(REMOTE_QUEUE_MANAGER).run_holding_job([{'jobid':mate_job_id}]) elif action == "start_both_or_yield": mate_job_can_run = False #try to invoke a scheduling iteration to see if remote yielding job can run now try: mate_job_can_run = ComponentProxy(REMOTE_QUEUE_MANAGER).try_to_run_mate_job(mate_job_id) except: self.logger.error("failed to connect to remote queue-manager component!") if mate_job_can_run: #now that mate has been started, start local job self.start_job([spec], {'location': nodelist}) dbgmsg += " ###start both" else: #mate job cannot run, give up the turn. mark the job as yielding. job_id = spec.get('jobid') self.yielding_job_list.append(job_id) #int #record the first time this job yields if not self.first_yield_hold_time_dict.has_key(job_id): self.first_yield_hold_time_dict[job_id] = self.get_current_time_sec() self.dbglog.LogMessage("%s: job %s first yield" % (self.get_current_time_date(), job_id)) #set tag false, enable scheduling another job at the same time self.event_manager.set_go_next(False) #self.print_screen() return len(specs) run_jobs = exposed(run_jobs) def start_job(self, specs, updates): '''update the job state and start_time and end_time when cqadm --run is issued to a group of jobs''' nodelist = updates['location'] self.nodes_down(nodelist) self.num_busy += len(nodelist) self.num_running += 1 self.num_waiting -= 1 def _start_job(job, newattr): '''callback function to update job start/end time''' temp = job.to_rx() newattr = self.run_job_updates(temp, newattr) temp.update(newattr) job.update(newattr) self.log_job_event('S', self.get_current_time_date(), temp) return self.queues.get_jobs(specs, _start_job, updates) def find_job_location(self, arg_list, end_times): best_location_dict = {} winner = arg_list[0] # first time through, try for starting jobs based on utility scores for args in arg_list: location_data = self._find_job_location(args) if location_data: best_location_dict.update(location_data) break # the next time through, try to backfill, but only if we couldn't find anything to start if not best_location_dict: job_end_times = {} total = 0 for item in sorted(end_times, cmp=self._backfill_cmp): total += len(item[0]) job_end_times[total] = item[1] needed = int(winner['nodes']) - len(self._get_available_nodes(winner)) now = self.get_current_time() ##different from super function backfill_cutoff = 0 for num in sorted(job_end_times): if needed <= num: 
backfill_cutoff = job_end_times[num] - now for args in arg_list: if 60*float(args['walltime']) > backfill_cutoff: continue location_data = self._find_job_location(args) if location_data: best_location_dict.update(location_data) self.logger.info("backfilling job %s" % args['jobid']) break #!!!following two lines must be commented for coscheduling feature because giving up may occur. when # a job is found location but give up to run, the nodes can't be updated to running status. # reserve the stuff in the best_partition_dict, as those partitions are allegedly going to # be running jobs very soon # for location_list in best_location_dict.itervalues(): # self.running_nodes.update(location_list) return best_location_dict find_job_location = exposed(find_job_location) # order the jobs with biggest utility first def utilitycmp(self, job1, job2): return -cmp(job1.score, job2.score) def compute_utility_scores (self): utility_scores = [] current_time = time.time() for job in self.queues.get_jobs([{'is_runnable':True}]): utility_name = self.queues[job.queue].policy args = {'queued_time':current_time - float(job.submittime), 'wall_time': 60*float(job.walltime_p), # *AdjEst* 'size': float(job.nodes), 'user_name': job.user, 'project': job.project, 'queue_priority': int(self.queues[job.queue].priority), #'machine_size': max_nodes, 'jobid': int(job.jobid), 'score': job.score, 'recovering': job.recovering, 'state': job.state, } try: if utility_name in self.builtin_utility_functions: utility_func = self.builtin_utility_functions[utility_name] else: utility_func = self.user_utility_functions[utility_name] utility_func.func_globals.update(args) score = utility_func() except KeyError: # do something sensible when the requested utility function doesn't exist # probably go back to the "default" one # and if we get here, try to fix it and throw away this scheduling iteration self.logger.error("cannot find utility function '%s' named by queue '%s'" % (utility_name, job.queue)) self.user_utility_functions[utility_name] = self.builtin_utility_functions["default"] self.logger.error("falling back to 'default' policy to replace '%s'" % utility_name) return except: # do something sensible when the requested utility function explodes # probably go back to the "default" one # and if we get here, try to fix it and throw away this scheduling iteration self.logger.error("error while executing utility function '%s' named by queue '%s'" % (utility_name, job.queue), \ exc_info=True) self.user_utility_functions[utility_name] = self.builtin_utility_functions["default"] self.logger.error("falling back to 'default' policy to replace '%s'" % utility_name) return try: job.score += score except: self.logger.error("utility function '%s' named by queue '%s' returned a non-number" % (utility_name, job.queue), \ exc_info=True) self.user_utility_functions[utility_name] = self.builtin_utility_functions["default"] self.logger.error("falling back to 'default' policy to replace '%s'" % utility_name) return def define_user_utility_functions(self): self.logger.info("building user utility functions") self.user_utility_functions.clear() filename = os.path.expandvars(get_bgsched_config("utility_file", "")) try: f = open(filename) except: #self.logger.error("Can't read utility function definitions from file %s" % get_bgsched_config("utility_file", "")) return str = f.read() try: code = compile(str, filename, 'exec') except: self.logger.error("Problem compiling utility function definitions.", exc_info=True) return globals = {'math':math, 'time':time} 
locals = {} try: exec code in globals, locals except: self.logger.error("Problem executing utility function definitions.", exc_info=True) for thing in locals.values(): if type(thing) is types.FunctionType: if thing.func_name in self.builtin_utility_functions: self.logger.error("Attempting to overwrite builtin utility function '%s'. User version discarded." % \ thing.func_name) else: self.user_utility_functions[thing.func_name] = thing define_user_utility_functions = exposed(define_user_utility_functions) def define_builtin_utility_functions(self): self.logger.info("building builtin utility functions") self.builtin_utility_functions.clear() # I think this duplicates cobalt's old scheduling policy # higher queue priorities win, with jobid being the tie breaker def default0(): val = queue_priority + 0.1 return val def default1(): '''FCFS''' val = queued_time return val def default(): '''WFP, supporting coordinated job recovery''' wall_time_sec = wall_time*60 val = ( queued_time / wall_time_sec)**3 * (size/64.0) return val def high_prio(): val = 1.0 return val self.builtin_utility_functions["default"] = default self.builtin_utility_functions["high_prio"] = high_prio #####coscheduling stuff def init_jobid_qtime_pairs(self): '''initialize mate job dict''' jobid_qtime_pairs = [] for id, spec in self.unsubmitted_job_spec_dict.iteritems(): qtime = spec['submittime'] jobid_qtime_pairs.append((qtime, int(id))) def _qtimecmp(tup1, tup2): return cmp(tup1[0], tup2[0]) jobid_qtime_pairs.sort(_qtimecmp) return jobid_qtime_pairs def find_mate_id(self, qtime, threshold): mate_subtime = 0 ret_id = 0 last = (0,0) for pair in self.remote_jobid_qtime_pairs: if pair[0] > qtime: break last = pair mate_subtime = last[0] mate_id = last[1] if mate_subtime > 0: if float(qtime) - mate_subtime < threshold: ret_id = mate_id return ret_id def init_mate_job_dict_by_vicinity(self): '''init mate job dictionary by vicinity''' temp_dict = {} #remote_id:local_id for id, spec in self.unsubmitted_job_spec_dict.iteritems(): id = int(id) submit_time = spec.get('submittime') mate_id = self.find_mate_id(submit_time, self.mate_vicinity) if mate_id > 0: #self.mate_job_dict[spec['jobid']] = int(mateid) if temp_dict.has_key(mate_id): tmp = temp_dict[mate_id] if id > tmp: temp_dict[mate_id] = id else: temp_dict[mate_id] = id #reserve dict to local_id:remote_id. (guarentee one-to-one) self.mate_job_dict = dict((local_id, remote_id) for remote_id, local_id in temp_dict.iteritems()) def init_mate_job_dict_by_ratio(self, ratio): '''init mate job dictionary by specified ratio''' if ratio <= 0.5: step = int(1.0 / ratio) reverse_step = 1 else: step = 1 reverse_step = int(1.0/(1-ratio)) print "step=", step print "reverse_step=", reverse_step i = 0 temp_dict = {} for item in self.jobid_qtime_pairs: remote_item = self.remote_jobid_qtime_pairs[i] random_number = random.random() if step > 1: if i % step == 0: temp_dict[item[1]] = remote_item[1] if reverse_step > 1: if i % reverse_step != 0: temp_dict[item[1]] = remote_item[1] i += 1 self.mate_job_dict = temp_dict def get_mate_job_dict(self): return self.mate_job_dict get_mate_job_dict = exposed(get_mate_job_dict) def hold_job(self, spec, updates): '''hold a job. a holding job is not started but hold some resources that can run itself in the future once its mate job in a remote system can be started immediatly. 
Note, one time hold only one job''' def _hold_job(job, newattr): '''callback function to update job start/end time''' temp = job.to_rx() newattr = self.hold_job_updates(temp, newattr) temp.update(newattr) job.update(newattr) self.log_job_event('H', self.get_current_time_date(), temp) current_holden_nodes = 0 for nodelist in self.job_hold_dict.values(): current_holden_nodes += len(nodelist) nodelist = updates['location'] job_id = spec['jobid'] if current_holden_nodes + len(nodelist) < self.max_holding_sys_util * self.total_nodes: self.job_hold_dict[job_id] = nodelist if not self.first_hold_time_dict.has_key(job_id): self.first_hold_time_dict[job_id] = self.get_current_time_sec() self.nodes_down(nodelist) if not self.first_yield_hold_time_dict.has_key(job_id): self.first_yield_hold_time_dict[job_id] = self.get_current_time_sec() return self.queues.get_jobs([spec], _hold_job, updates) else: #if execeeding the maximum limite of holding nodes, the job will not hold but yield self.yielding_job_list.append(job_id) #int #record the first time this job yields if not self.first_yield_hold_time_dict.has_key(job_id): self.first_yield_hold_time_dict[job_id] = self.get_current_time_sec() self.dbglog.LogMessage("%s: job %s first yield" % (self.get_current_time_date(), job_id)) return 0 def hold_job_updates(self, jobspec, newattr): ''' return the state updates (including state queued -> running, setting the start_time, end_time)''' updates = {} updates['is_runnable'] = False updates['has_resources'] = False updates['state'] = "holding" updates['last_hold'] = self.get_current_time_sec() updates.update(newattr) if SELF_UNHOLD_INTERVAL > 0: release_time = self.get_current_time_sec() + SELF_UNHOLD_INTERVAL self.insert_time_stamp(release_time, "U", {'jobid':jobspec['jobid'], 'location':newattr['location']}) return updates def unhold_job(self, jobid): '''if a job holds a partition longer than MAX_HOLD threshold, the job will release the partition and starts yielding''' nodelist = self.job_hold_dict.get(jobid) #release holden partitions if nodelist: self.nodes_up(nodelist) else: print "holding job %s not found in job_hold_dict: " % jobid def _unholding_job(job, newattr): '''callback function''' temp = job.to_rx() newattr = self.unholding_job_updates(temp, newattr) temp.update(newattr) job.update(newattr) self.log_job_event("U", self.get_current_time_date(), temp) del self.job_hold_dict[jobid] return self.queues.get_jobs([{'jobid':jobid}], _unholding_job, {'location':self.job_hold_dict.get(jobid, ["N"])}) def unholding_job_updates(self, jobspec, newattr): '''unhold job''' updates = {} updates['is_runnable'] = True updates['has_resources'] = False updates['state'] = "queued" #set the job to lowest priority at this scheduling point. #if no other job gets the nodes it released, the unholden job can hold those nodes again updates['score'] = 0 updates['hold_time'] = jobspec['hold_time'] + self.get_current_time_sec() - jobspec['last_hold'] updates['last_hold'] = 0 updates.update(newattr) return updates def unhold_all(self): '''unhold all jobs. periodically invoked to prevent deadlock''' for jobid in self.job_hold_dict.keys(): job_hold_time = self.get_current_time_sec() - self.first_hold_time_dict[jobid] #if a job has holden at least 10 minutes, then periodically unhold it if job_hold_time > AT_LEAST_HOLD: self.unhold_job(jobid) def try_to_run_mate_job(self, _jobid): '''try to run mate job, start all the jobs that can run. If the started jobs include the given mate job, return True else return False. 
_jobid : int ''' #if the job is not yielding, do not continue; no other job is possibly to be scheduled if _jobid not in self.yielding_job_list: return False mate_job_started = False #start all the jobs that can run while True: running_jobs = [job for job in self.queues.get_jobs([{'has_resources':True}])] end_times = [] now = self.get_current_time_sec() for job in running_jobs: end_time = max(float(job.starttime) + 60 * float(job.walltime), now + 5*60) end_times.append([job.location, end_time]) active_jobs = [job for job in self.queues.get_jobs([{'is_runnable':True}])] #waiting jobs active_jobs.sort(self.utilitycmp) job_location_args = [] for job in active_jobs: if not job.jobid == _jobid and self.mate_job_dict.get(job.jobid, 0) > 0: #if a job other than given job (_jobid) has mate, skip it. continue job_location_args.append({'jobid': str(job.jobid), 'nodes': job.nodes, 'queue': job.queue, 'forbidden': [], 'utility_score': job.score, 'walltime': job.walltime, 'walltime_p': job.walltime_p, #*AdjEst* 'attrs': job.attrs, } ) if len(job_location_args) == 0: break #print "queue order=", [item['jobid'] for item in job_location_args] best_partition_dict = self.find_job_location(job_location_args, end_times) if best_partition_dict: #print "best_partition_dict=", best_partition_dict for canrun_jobid in best_partition_dict: nodelist = best_partition_dict[canrun_jobid] if str(_jobid) == canrun_jobid: mate_job_started = True self.start_job([{'tag':"job", 'jobid':int(canrun_jobid)}], {'location':nodelist}) #print "bqsim.try_to_run_mate, start job jobid ", canrun_jobid else: break return mate_job_started try_to_run_mate_job = exposed(try_to_run_mate_job) def run_holding_job(self, specs): '''start holding job''' for spec in specs: jobid = spec.get('jobid') nodelist = self.job_hold_dict.get(jobid, None) if nodelist == None: #print "cannot find holding resources" return #print "start holding job %s on location %s" % (spec['jobid'], nodelist) self.start_job([spec], {'location':nodelist}) del self.job_hold_dict[jobid] run_holding_job = exposed(run_holding_job) #coscheduling stuff def get_mate_job_status(self, jobid): '''return mate job status, remote function, invoked by remote component''' ret_dict = {'jobid':jobid} ret_dict['status'] = self.get_coschedule_status(jobid) return ret_dict get_mate_job_status = exposed(get_mate_job_status) def get_mate_jobs_status_local(self, remote_jobid): '''return mate job status, invoked by local functions''' status_dict = {} try: status_dict = ComponentProxy(REMOTE_QUEUE_MANAGER).get_mate_job_status(remote_jobid) except: self.logger.error("failed to connect to remote queue-manager component!") status_dict = {'status':'notconnected'} self.dbglog.LogMessage("failed to connect to remote queue-manager component!") return status_dict def test_can_run(self, jobid): '''test whether a job can start immediately, specifically in following cases: 1. highest utility score and resource is available 2. not with top priority but can start in non-drained partition when top-priority job is draining 3. can be backfilled ''' return False def get_coschedule_status(self, jobid): '''return job status regarding coscheduling, input: jobid output: listed as follows: 1. "queuing" 2. "holding" 3. "unsubmitted" 4. "running" 5. 
"ended" ''' ret_status = "unknown" job = self.get_live_job_by_id(jobid) if job: #queuing or running has_resources = job.has_resources is_runnable = job.is_runnable if is_runnable and not has_resources: ret_status = "queuing" if not is_runnable and has_resources: ret_status = "running" if not is_runnable and not has_resources: ret_status = "holding" else: #unsubmitted or ended if self.unsubmitted_job_spec_dict.has_key(str(jobid)): ret_status = "unsubmitted" else: ret_status = "unknown" #ended or no such job del self.mate_job_dict[jobid] return ret_status #display stuff def print_screen(self, cur_event=""): '''print screen, show number of waiting jobs, running jobs, busy_nodes%''' print "Cluster" current_datetime = self.event_manager.get_current_date_time() print "%s %s" % (current_datetime, cur_event) print "number of waiting jobs: ", self.num_waiting waiting_job_bar = REDS for i in range(self.num_waiting): waiting_job_bar += "*" waiting_job_bar += ENDC print waiting_job_bar holding_jobs = len(self.job_hold_dict.keys()) holden_nodes = 0 for nodelist in self.job_hold_dict.values(): nodes = len(nodelist) holden_nodes += nodes print "number of running jobs: ", self.num_running running_job_bar = BLUES for i in range(self.num_running): running_job_bar += "+" running_job_bar += ENDC print running_job_bar print "number of holding jobs: ", holding_jobs print "number of holden nodes: ", holden_nodes print "number of busy nodes: ", self.num_busy print "system utilization: ", float(self.num_busy) / self.total_nodes busy_node_bar = GREENS i = 0 while i < self.num_busy: busy_node_bar += "x" i += 1 j = 0 busy_node_bar += ENDC busy_node_bar += YELLOWS while j < holden_nodes: busy_node_bar += '+' j += 1 i += 1 busy_node_bar += ENDC for k in range(i, self.total_nodes): busy_node_bar += "-" busy_node_bar += REDS busy_node_bar += "|" busy_node_bar += ENDC print busy_node_bar print "completed jobs/total jobs: %s/%s" % (self.num_end, self.total_job) progress = 100 * self.num_end / self.total_job progress_bar = "" i = 0 while i < progress: progress_bar += "=" i += 1 for j in range(i, 100): progress_bar += "-" progress_bar += "|" print progress_bar #print "waiting jobs: ", [(job.jobid, job.nodes) for job in self.queues.get_jobs([{'is_runnable':True}])] #print "holding jobs: ", self.job_hold_dict.keys() if self.sleep_interval: time.sleep(self.sleep_interval) def post_simulation_handling(self): '''post screen after simulation completes''' #print self.first_yield_hold_time_dict pass post_simulation_handling = exposed(post_simulation_handling)
class OrcmSystem (OrcmBaseSystem): """ORCM system component. Methods: add_process_groups -- add (start) an mpirun process on the system (exposed, ~query) get_process_groups -- retrieve mpirun processes (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- update partition state from the bridge API (runs as a thread) """ name = "system" implementation = "orcm_system" logger = logger def __init__ (self, *args, **kwargs): OrcmBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = OrcmProcessGroup def __del__ (self): OrcmBaseSystem.__del__(self) def __getstate__(self): state = {} state.update(OrcmBaseSystem.__getstate__(self)) # state.update({ # "orcm_system_version": 1 }) return state def __setstate__(self, state): OrcmBaseSystem.__setstate__(self, state) self.process_groups.item_cls = OrcmProcessGroup def add_process_groups (self, specs): """Create a process group. Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)", specs) process_groups = self.process_groups.q_add(specs) for pgroup in process_groups: self.logger.info("Job %s/%s: process group %s created to track script", pgroup.user, pgroup.jobid, pgroup.id) #System has started the job. We need remove them from the temp, alloc array #in orcm_base_system. self.apg_started = True for pgroup in process_groups: for location in pgroup.location: try: del self.alloc_only_nodes[location] except KeyError: logger.critical("%s already removed from alloc_only_nodes list", location) return process_groups add_process_groups = exposed(query(add_process_groups)) def get_process_groups (self, specs): self._get_exit_status() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def _get_exit_status (self): children = {} cleanup = {} _get_exit_status = automatic(_get_exit_status, float(get_orcm_system_config('get_exit_status_interval', 10))) def wait_process_groups (self, specs): process_groups = self.process_groups.q_get(specs) return process_groups wait_process_groups = locking(exposed(query(wait_process_groups))) def signal_process_groups (self, specs, signame="SIGINT"): my_process_groups = self.process_groups.q_get(specs) for pg in my_process_groups: OrcmBaseSystem.cancel_session(self, pg.jobid) pg.exit_status = 0 for host in pg.location: self.running_nodes.discard(host) return my_process_groups signal_process_groups = exposed(query(signal_process_groups)) def del_process_groups(self, jobid): '''delete a process group and don't track it anymore. jobid -- jobid associated with the process group we are removing ''' del_items = self.process_groups.q_del([{'jobid':jobid}]) if del_items == []: self.logger.warning("Job %s: Process group not found for this jobid.", jobid) else: self.logger.info("Job %s: Process group deleted.", jobid) return
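# Illustrative sketch (not part of the component): the bookkeeping that
# OrcmSystem.signal_process_groups above performs when it cancels a session -- each
# signalled group is marked finished and its hosts are released from the running set.
# The stand-in class and helper below are hypothetical, not Cobalt's real objects.
class _SketchGroup(object):
    def __init__(self, jobid, location):
        self.jobid = jobid
        self.location = location
        self.exit_status = None

def _sketch_cancel(groups, running_nodes):
    """Mark every group finished and release its hosts from running_nodes."""
    for pg in groups:
        pg.exit_status = 0
        for host in pg.location:
            running_nodes.discard(host)
    return groups

# Example (values are illustrative):
#   running = set(['n1', 'n2', 'n3'])
#   _sketch_cancel([_SketchGroup(7, ['n1', 'n2'])], running)
#   running -> set(['n3'])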
class ClusterSystem (ClusterBaseSystem): """cluster system component. Methods: configure -- load partitions from the bridge API add_process_groups -- add (start) an mpirun process on the system (exposed, ~query) get_process_groups -- retrieve mpirun processes (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- update partition state from the bridge API (runs as a thread) """ name = "system" implementation = "cluster_system" logger = logger def __init__ (self, *args, **kwargs): ClusterBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = ClusterProcessGroup def __setstate__(self, state): ClusterBaseSystem.__setstate__(self, state) self.process_groups.item_cls = ClusterProcessGroup def add_process_groups (self, specs): """Create a process group. Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)", specs) process_groups = self.process_groups.q_add(specs) for pgroup in process_groups: self.logger.info("job %s/%s: process group %s created to track script", pgroup.jobid, pgroup.user, pgroup.id) return process_groups add_process_groups = exposed(query(add_process_groups)) def get_process_groups (self, specs): self._get_exit_status() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def _get_exit_status (self): try: running = ComponentProxy("forker").active_list() except: self.logger.error("failed to contact forker component for list of running jobs") return for each in self.process_groups.itervalues(): if each.head_pid not in running and each.exit_status is None: # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just # assume the process is dead? or maybe just say there's no exit code the first time it happens? 
# maybe the second choice is better try: dead_dict = ComponentProxy("forker").get_status(each.head_pid) except Queue.Empty: self.logger.error("failed call for get_status from forker component for pg %s", each.head_pid) return if dead_dict is None: self.logger.info("process group %i: job %s/%s exited with unknown status", each.id, each.jobid, each.user) each.exit_status = 1234567 else: each.exit_status = dead_dict["exit_status"] if dead_dict["signum"] == 0: self.logger.info("process group %i: job %s/%s exited with status %i", each.id, each.jobid, each.user, each.exit_status) else: if dead_dict["core_dump"]: core_dump_str = ", core dumped" else: core_dump_str = "" self.logger.info("process group %i: job %s/%s terminated with signal %s%s", each.id, each.jobid, each.user, dead_dict["signum"], core_dump_str) _get_exit_status = automatic(_get_exit_status) def wait_process_groups (self, specs): self._get_exit_status() process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None] for process_group in process_groups: thread.start_new_thread(self.clean_nodes, (process_group,)) return process_groups wait_process_groups = locking(exposed(query(wait_process_groups))) def signal_process_groups (self, specs, signame="SIGINT"): my_process_groups = self.process_groups.q_get(specs) for pg in my_process_groups: if pg.exit_status is None: try: ComponentProxy("forker").signal(pg.head_pid, signame) except: self.logger.error("Failed to communicate with forker when signalling job") return my_process_groups signal_process_groups = exposed(query(signal_process_groups)) def clean_nodes(self, pg): try: tmp_data = pwd.getpwnam(pg.user) groupid = tmp_data.pw_gid group_name = grp.getgrgid(groupid)[0] except KeyError: group_name = "" self.logger.error("Job %s/%s unable to determine group name for epilogue" % (pg.jobid, pg.user)) processes = [] for host in pg.location: h = host.split(":")[0] try: p = subprocess.Popen(["/usr/bin/ssh", h, pg.config.get("epilogue"), str(pg.jobid), pg.user, group_name], stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.host = h processes.append(p) except: self.logger.error("Job %s/%s failed to run epilogue on host %s", pg.jobid, pg.user, h, exc_info=True) start = time.time() dirty_nodes = [] while True: running = False for p in processes: if p.poll() is None: running = True break if not running: break if time.time() - start > float(pg.config.get("epilogue_timeout")): for p in processes: if p.poll() is None: try: os.kill(p.pid, signal.SIGTERM) dirty_nodes.append(p.host) self.logger.error("Job %s/%s epilogue timed out on host %s" % (pg.jobid, pg.user, p.host)) except: self.logger.error("epilogue for %s already terminated" %p.host) break else: time.sleep(5) for p in processes: if p.poll() > 0: self.logger.error("epilogue failed for host %s", p.host) self.logger.error("stderr from epilogue on host %s: [%s]", p.host, p.stderr.read().strip()) self.lock.acquire() try: self.logger.info("job finished on %s", Cobalt.Util.merge_nodelist(pg.location)) for host in pg.location: self.running_nodes.discard(host) if dirty_nodes: for host in dirty_nodes: self.down_nodes.add(host) self.logger.info("epilogue timed out, marking host %s down" % host) p = subprocess.Popen([pg.config.get("epi_epilogue"), str(pg.jobid), pg.user, group_name] + dirty_nodes) del self.process_groups[pg.id] except: self.logger.error("error in clean_nodes", exc_info=True) self.lock.release()
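# A standalone sketch of the poll-with-timeout pattern ClusterSystem.clean_nodes above uses for
# epilogue scripts: start one subprocess per host, poll until they all exit, and SIGTERM
# anything still running once the timeout passes, returning the hosts whose epilogue had to be
# killed.  The helper name and arguments are illustrative; the real method builds the ssh
# command line from the process group's config and then marks the returned hosts down.
import os
import signal
import subprocess
import time

def run_epilogues_with_timeout(host_commands, timeout, poll_interval=5):
    """host_commands: list of (host, argv) pairs; returns the hosts whose command was killed."""
    procs = []
    for host, argv in host_commands:
        p = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        procs.append((host, p))
    start = time.time()
    dirty_hosts = []
    while any(p.poll() is None for _host, p in procs):
        if time.time() - start > float(timeout):
            for host, p in procs:
                if p.poll() is None:
                    os.kill(p.pid, signal.SIGTERM)
                    dirty_hosts.append(host)
            break
        time.sleep(poll_interval)
    return dirty_hosts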
class BGSched(Component): implementation = "bgsched" name = "scheduler" logger = logging.getLogger("Cobalt.Components.scheduler") _configfields = ['utility_file'] _config = ConfigParser.ConfigParser() print Cobalt.CONFIG_FILES _config.read(Cobalt.CONFIG_FILES) if not _config._sections.has_key('bgsched'): print '''"bgsched" section missing from cobalt config file''' sys.exit(1) config = _config._sections['bgsched'] mfields = [field for field in _configfields if not config.has_key(field)] if mfields: print "Missing option(s) in cobalt config file [bgsched] section: %s" % ( " ".join(mfields)) sys.exit(1) if config.get("default_reservation_policy"): global DEFAULT_RESERVATION_POLICY DEFAULT_RESERVATION_POLICY = config.get("default_reservation_policy") def __init__(self, *args, **kwargs): Component.__init__(self, *args, **kwargs) self.COMP_QUEUE_MANAGER = "queue-manager" self.COMP_SYSTEM = "system" self.reservations = ReservationDict() self.queues = QueueDict(self.COMP_QUEUE_MANAGER) self.jobs = JobDict(self.COMP_QUEUE_MANAGER) self.started_jobs = {} self.sync_state = Cobalt.Util.FailureMode("Foreign Data Sync") self.active = True self.get_current_time = time.time self.id_gen = IncrID() global bgsched_id_gen bgsched_id_gen = self.id_gen self.cycle_id_gen = IncrID() global bgsched_cycle_id_gen bgsched_cycle_id_gen = self.cycle_id_gen def __getstate__(self): return { 'reservations': self.reservations, 'version': 1, 'active': self.active, 'next_res_id': self.id_gen.idnum + 1, 'next_cycle_id': self.cycle_id_gen.idnum + 1, 'msg_queue': dbwriter.msg_queue, 'overflow': dbwriter.overflow } def __setstate__(self, state): self.reservations = state['reservations'] if 'active' in state: self.active = state['active'] else: self.active = True self.id_gen = IncrID() self.id_gen.set(state['next_res_id']) global bgsched_id_gen bgsched_id_gen = self.id_gen self.cycle_id_gen = IncrID() self.cycle_id_gen.set(state['next_cycle_id']) global bgsched_cycle_id_gen bgsched_cycle_id_gen = self.cycle_id_gen self.queues = QueueDict(self.COMP_QUEUE_MANAGER) self.jobs = JobDict(self.COMP_QUEUE_MANAGER) self.started_jobs = {} self.sync_state = Cobalt.Util.FailureMode("Foreign Data Sync") self.get_current_time = time.time self.lock = threading.Lock() self.statistics = Statistics() if state.has_key('msg_queue'): dbwriter.msg_queue = state['msg_queue'] if state.has_key('overflow') and (dbwriter.max_queued != None): dbwriter.overflow = state['overflow'] # order the jobs with biggest utility first def utilitycmp(self, job1, job2): return -cmp(job1.score, job2.score) def prioritycmp(self, job1, job2): """Compare 2 jobs first using queue priority and then first-in, first-out.""" val = cmp(self.queues[job1.queue].priority, self.queues[job2.queue].priority) if val == 0: return self.fifocmp(job1, job2) else: # we want the higher priority first return -val def fifocmp(self, job1, job2): """Compare 2 jobs for first-in, first-out.""" def fifo_value(job): if job.index is not None: return int(job.index) else: return job.jobid # Implement some simple variations on FIFO scheduling # within a particular queue, based on queue policy fifoval = cmp(fifo_value(job1), fifo_value(job2)) if (job1.queue == job2.queue): qpolicy = self.queues[job1.queue].policy sizeval = cmp(int(job1.nodes), int(job2.nodes)) wtimeval = cmp(int(job1.walltime), int(job2.walltime)) if (qpolicy == 'largest-first' and sizeval): return -sizeval elif (qpolicy == 'smallest-first' and sizeval): return sizeval elif (qpolicy == 'longest-first' and wtimeval): return -wtimeval elif 
(qpolicy == 'shortest-first' and wtimeval): return wtimeval else: return fifoval else: return fifoval return cmp(fifo_value(job1), fifo_value(job2)) def save_me(self): Component.save(self) save_me = automatic(save_me) #user_name in this context is the user setting/modifying the res. def add_reservations(self, specs, user_name): self.logger.info("%s adding reservation: %r" % (user_name, specs)) added_reservations = self.reservations.q_add(specs) for added_reservation in added_reservations: self.logger.info("Res %s/%s: %s adding reservation: %r" % (added_reservation.res_id, added_reservation.cycle_id, user_name, specs)) dbwriter.log_to_db(user_name, "creating", "reservation", added_reservation) return added_reservations add_reservations = exposed(query(add_reservations)) def del_reservations(self, specs, user_name): self.logger.info("%s releasing reservation: %r" % (user_name, specs)) del_reservations = self.reservations.q_del(specs) for del_reservation in del_reservations: self.logger.info("Res %s/%s/: %s releasing reservation: %r" % (del_reservation.res_id, del_reservation.cycle_id, user_name, specs)) #dbwriter.log_to_db(user_name, "ending", "reservation", del_reservation) return del_reservations del_reservations = exposed(query(del_reservations)) def get_reservations(self, specs): return self.reservations.q_get(specs) get_reservations = exposed(query(get_reservations)) def set_reservations(self, specs, updates, user_name): log_str = "%s modifying reservation: %r with updates %r" % ( user_name, specs, updates) self.logger.info(log_str) #handle defers as a special case: have to log these, and not drop a mod record. def _set_reservations(res, newattr): res.update(newattr) updates['__cmd_user'] = user_name mod_reservations = self.reservations.q_get(specs, _set_reservations, updates) for mod_reservation in mod_reservations: self.logger.info("Res %s/%s: %s modifying reservation: %r" % (mod_reservation.res_id, mod_reservation.cycle_id, user_name, specs)) return mod_reservations set_reservations = exposed(query(set_reservations)) def release_reservations(self, specs, user_name): self.logger.info("%s requested release of reservation: %r" % (user_name, specs)) self.logger.info("%s releasing reservation: %r" % (user_name, specs)) rel_res = self.get_reservations(specs) for res in rel_res: dbwriter.log_to_db(user_name, "released", "reservation", res) del_reservations = self.reservations.q_del(specs) for del_reservation in del_reservations: self.logger.info("Res %s/%s/: %s releasing reservation: %r" % (del_reservation.res_id, del_reservation.cycle_id, user_name, specs)) return del_reservations release_reservations = exposed(query(release_reservations)) def check_reservations(self): ret = "" reservations = self.reservations.values() for i in range(len(reservations)): for j in range(i + 1, len(reservations)): # if at least one reservation is cyclic, we want *that* reservation to be the one getting its overlaps method # called if reservations[i].cycle is not None: res1 = reservations[i] res2 = reservations[j] else: res1 = reservations[j] res2 = reservations[i] # we subtract a little bit because the overlaps method isn't really meant to do this # it will report warnings when one reservation starts at the same time another ends if res1.overlaps(res2.start, res2.duration - 0.00001): # now we need to check for overlap in space results = ComponentProxy(self.COMP_SYSTEM).get_partitions( [{ 'name': p, 'children': '*', 'parents': '*' } for p in res2.partitions.split(":")]) for p in res1.partitions.split(":"): for 
r in results: if p == r['name'] or p in r['children'] or p in r[ 'parents']: ret += "Warning: reservation '%s' overlaps reservation '%s'\n" % ( res1.name, res2.name) return ret check_reservations = exposed(check_reservations) def sync_data(self): started = self.get_current_time() for item in [self.jobs, self.queues]: try: item.Sync() except (ComponentLookupError, xmlrpclib.Fault): # the ForeignDataDicts already include FailureMode stuff pass # print "took %f seconds for sync_data" % (time.time() - started, ) #sync_data = automatic(sync_data) def _run_reservation_jobs(self, reservations_cache): # handle each reservation separately, as they shouldn't be competing for resources for cur_res in reservations_cache.itervalues(): #print "trying to run res jobs in", cur_res.name, self.started_jobs queue = cur_res.queue if not (self.queues.has_key(queue) and self.queues[queue].state == 'running'): continue temp_jobs = self.jobs.q_get([{ 'is_runnable': True, 'queue': queue }]) active_jobs = [] for j in temp_jobs: if not self.started_jobs.has_key( j.jobid) and cur_res.job_within_reservation(j): active_jobs.append(j) if not active_jobs: continue active_jobs.sort(self.utilitycmp) job_location_args = [] for job in active_jobs: job_location_args.append({ 'jobid': str(job.jobid), 'nodes': job.nodes, 'queue': job.queue, 'required': cur_res.partitions.split(":"), 'utility_score': job.score, 'walltime': job.walltime, 'attrs': job.attrs, 'user': job.user, }) # there's no backfilling in reservations try: best_partition_dict = ComponentProxy( self.COMP_SYSTEM).find_job_location(job_location_args, []) except: self.logger.error("failed to connect to system component") best_partition_dict = {} for jobid in best_partition_dict: job = self.jobs[int(jobid)] self._start_job(job, best_partition_dict[jobid], {str(job.jobid): cur_res.res_id}) def _start_job(self, job, partition_list, resid=None): """Get the queue manager to start a job.""" cqm = ComponentProxy(self.COMP_QUEUE_MANAGER) try: self.logger.info("trying to start job %d on partition %r" % (job.jobid, partition_list)) cqm.run_jobs([{ 'tag': "job", 'jobid': job.jobid }], partition_list, None, resid) except ComponentLookupError: self.logger.error("failed to connect to queue manager") return self.started_jobs[job.jobid] = self.get_current_time() def schedule_jobs(self): '''look at the queued jobs, and decide which ones to start''' started_scheduling = self.get_current_time() if not self.active: return self.sync_data() # if we're missing information, don't bother trying to schedule jobs if not (self.queues.__oserror__.status and self.jobs.__oserror__.status): self.sync_state.Fail() return self.sync_state.Pass() self.lock.acquire() try: # cleanup any reservations which have expired for res in self.reservations.values(): if res.is_over(): self.logger.info("reservation %s has ended; removing" % (res.name)) self.logger.info("Res %s/%s: Ending reservation: %r" % (res.res_id, res.cycle_id, res.name)) #dbwriter.log_to_db(None, "ending", "reservation", # res) del_reservations = self.reservations.q_del([{ 'name': res.name }]) reservations_cache = self.reservations.copy() except: # just to make sure we don't keep the lock forever self.logger.error("error in schedule_jobs", exc_info=True) self.lock.release() # clean up the started_jobs cached data # TODO: Make this tunable. 
now = self.get_current_time() for job_name in self.started_jobs.keys(): if (now - self.started_jobs[job_name]) > 60: del self.started_jobs[job_name] active_queues = [] spruce_queues = [] res_queues = set() for item in reservations_cache.q_get([{'queue': '*'}]): if self.queues.has_key(item.queue): if self.queues[item.queue].state == 'running': res_queues.add(item.queue) for queue in self.queues.itervalues(): if queue.name not in res_queues and queue.state == 'running': if queue.policy == "high_prio": spruce_queues.append(queue) else: active_queues.append(queue) # handle the reservation jobs that might be ready to go self._run_reservation_jobs(reservations_cache) # figure out stuff about queue equivalence classes if __running_mode__ == "simulation": equiv = [{'reservations': [], 'queues': ['default']}] else: res_info = {} for cur_res in reservations_cache.values(): res_info[cur_res.name] = cur_res.partitions try: equiv = ComponentProxy( self.COMP_SYSTEM).find_queue_equivalence_classes( res_info, [q.name for q in active_queues + spruce_queues]) except: self.logger.error("failed to connect to system component") return for eq_class in equiv: # recall that is_runnable is True for certain types of holds temp_jobs = self.jobs.q_get([{'is_runnable':True, 'queue':queue.name} for queue in active_queues \ if queue.name in eq_class['queues']]) active_jobs = [] for j in temp_jobs: if not self.started_jobs.has_key(j.jobid): active_jobs.append(j) temp_jobs = self.jobs.q_get([{'is_runnable':True, 'queue':queue.name} for queue in spruce_queues \ if queue.name in eq_class['queues']]) spruce_jobs = [] for j in temp_jobs: if not self.started_jobs.has_key(j.jobid): spruce_jobs.append(j) # if there are any pending jobs in high_prio queues, those are the only ones that can start if spruce_jobs: active_jobs = spruce_jobs # get the cutoff time for backfilling # # BRT: should we use 'has_resources' or 'is_active'? has_resources returns to false once the resource epilogue # scripts have finished running while is_active only returns to false once the job (not just the running task) has # completely terminated. the difference is likely to be slight unless the job epilogue scripts are heavy weight. 
temp_jobs = [ job for job in self.jobs.q_get([{ 'has_resources': True }]) if job.queue in eq_class['queues'] ] end_times = [] for job in temp_jobs: # take the max so that jobs which have gone overtime and are being killed # continue to cast a small backfilling shadow (we need this for the case # that the final job in a drained partition runs overtime -- which otherwise # allows things to be backfilled into the drained partition) ##*AdjEst* if running_job_walltime_prediction: runtime_estimate = float(job.walltime_p) else: runtime_estimate = float(job.walltime) end_time = max( float(job.starttime) + 60 * runtime_estimate, now + 5 * 60) end_times.append([job.location, end_time]) for res_name in eq_class['reservations']: cur_res = reservations_cache[res_name] if not cur_res.cycle: end_time = float(cur_res.start) + float(cur_res.duration) else: done_after = float(cur_res.duration) - ( (now - float(cur_res.start)) % float(cur_res.cycle)) if done_after < 0: done_after += cur_res.cycle end_time = now + done_after if cur_res.is_active(): for part_name in cur_res.partitions.split(":"): end_times.append([[part_name], end_time]) if not active_jobs: continue active_jobs.sort(self.utilitycmp) # now smoosh lots of data together to be passed to the allocator in the system component job_location_args = [] for job in active_jobs: forbidden_locations = set() for res_name in eq_class['reservations']: cur_res = reservations_cache[res_name] if cur_res.overlaps(self.get_current_time(), 60 * float(job.walltime) + SLOP_TIME): forbidden_locations.update( cur_res.partitions.split(":")) job_location_args.append({ 'jobid': str(job.jobid), 'nodes': job.nodes, 'queue': job.queue, 'forbidden': list(forbidden_locations), 'utility_score': job.score, 'walltime': job.walltime, 'walltime_p': job.walltime_p, #*AdjEst* 'attrs': job.attrs, 'user': job.user, }) try: best_partition_dict = ComponentProxy( self.COMP_SYSTEM).find_job_location( job_location_args, end_times) except: self.logger.error("failed to connect to system component", exc_info=True) best_partition_dict = {} for jobid in best_partition_dict: job = self.jobs[int(jobid)] self._start_job(job, best_partition_dict[jobid]) # print "took %f seconds for scheduling loop" % (time.time() - started_scheduling, ) schedule_jobs = locking(automatic(schedule_jobs)) def get_resid(self, queue_name): return None get_resid = exposed(get_resid) def enable(self, user_name): """Enable scheduling""" self.logger.info("%s enabling scheduling", user_name) self.active = True enable = exposed(enable) def disable(self, user_name): """Disable scheduling""" self.logger.info("%s disabling scheduling", user_name) self.active = False disable = exposed(disable) def set_res_id(self, id_num): """Set the reservation id number.""" self.id_gen.set(id_num) logger.info("Reset res_id generator to %s." % id_num) set_res_id = exposed(set_res_id) def set_cycle_id(self, id_num): """Set the cycle id number.""" self.cycle_id_gen.set(id_num) logger.info("Reset cycle_id generator to %s." % id_num) set_cycle_id = exposed(set_cycle_id) def force_res_id(self, id_num): """Override the id-generator and change the resid to id_num""" self.id_gen.idnum = id_num - 1 logger.warning("Forced res_id generator to %s." % id_num) force_res_id = exposed(force_res_id) def force_cycle_id(self, id_num): """Override the id-generator and change the cycleid to id_num""" self.cycle_id_gen.idnum = id_num - 1 logger.warning("Forced cycle_id generator to %s." 
% id_num) force_cycle_id = exposed(force_cycle_id) def get_next_res_id(self): """Get what the next resid number would be""" return self.id_gen.idnum + 1 get_next_res_id = exposed(get_next_res_id) def get_next_cycle_id(self): """get what the next cycleid number would be""" return self.cycle_id_gen.idnum + 1 get_next_cycle_id = exposed(get_next_cycle_id) def __flush_msg_queue(self): """Send queued messages to the database-writer component""" dbwriter.flush_queue() __flush_msg_queue = automatic( __flush_msg_queue, float(get_bgsched_config('db_flush_interval', 10)))
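# A standalone sketch (plain-dict jobs instead of the component's Job objects) of how
# BGSched.schedule_jobs above builds the end_times list it hands to find_job_location: every
# running job casts a backfilling shadow until its estimated end, and a job that has already
# exceeded its walltime still casts a small five-minute shadow so a drained partition is not
# backfilled into while its final job is being killed.
def backfill_end_times(running_jobs, now):
    """running_jobs: dicts with 'location' (list), 'starttime' (seconds), 'walltime' (minutes)."""
    end_times = []
    for job in running_jobs:
        end_time = max(float(job['starttime']) + 60 * float(job['walltime']), now + 5 * 60)
        end_times.append([job['location'], end_time])
    return end_times

# Example (as a comment): a job that started 90 minutes ago with a 60-minute walltime still
# contributes a shadow ending 5 minutes from now.
#   backfill_end_times([{'location': ['P1'], 'starttime': now - 90 * 60, 'walltime': 60}], now)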
                    # until the job has exhausted its maximum allotted time
                    del self.process_groups[process_group.id]
                    raise
                except:
                    self.logger.error("%s: an unexpected exception occurred while attempting to start the process group "
                        "using the %s component; releasing resources",
                        pgroup.label, pgroup.forker, exc_info=True)
                    self.reserve_resources_until(pgroup.location, None, pgroup.jobid)
                    pgroup.exit_status = 255
            else:
                self.logger.error("%s: the internal reservation on %s expired; job has been terminated",
                    pgroup.label, pgroup.location)
                pgroup.exit_status = 255
        return process_groups
    add_process_groups = exposed(query(all_fields=True)(add_process_groups))

    def get_process_groups (self, specs):
        """Query process_groups from the simulator."""
        self._get_exit_status()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status (self):
        #common to bgsystem
        running = []
        active_forker_components = []
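# A minimal standalone sketch (hypothetical helper; the real _get_exit_status above goes
# through ComponentProxy to the forker components) of the reconciliation it performs: a
# tracked process group whose head PID no longer appears in a forker's active list is treated
# as finished, and an unknown status is recorded with the same 1234567 sentinel the cluster
# system code uses.
def reconcile_exit_status(process_groups, active_pids, fetch_status):
    """process_groups: objects with head_pid/exit_status; active_pids: PIDs still running;
    fetch_status: callable(pid) -> exit status, or None when the forker lost track of it."""
    for pg in process_groups:
        if pg.exit_status is None and pg.head_pid not in active_pids:
            status = fetch_status(pg.head_pid)
            pg.exit_status = 1234567 if status is None else status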
class Qsimulator(Simulator): '''Cobalt Queue Simulator''' implementation = "qsim" name = "queue-manager" alias = Simulator.name def __init__(self, *args, **kwargs): print "kwargs= ", kwargs #initialize partitions Simulator.__init__(self, *args, **kwargs) partnames = self._partitions.keys() self.init_partition(partnames) self.part_size_list = [] for part in self.partitions.itervalues(): if int(part.size) not in self.part_size_list: self.part_size_list.append(int(part.size)) self.part_size_list.sort() #get command line parameters self.FAILURE_FREE = True self.FRACTION = kwargs.get("fraction", 1) self.workload_file = kwargs.get("workload") self.output_log = kwargs.get("outputlog") self.failure_log = kwargs.get('failurelog') self.weibull = kwargs.get('weibull') if self.weibull: self.SCALE = float(kwargs.get('scale')) if self.SCALE == 0: self.SCALE = default_SCALE self.SHAPE = float(kwargs.get('shape')) if self.SHAPE == 0: self.SHAPE = default_SHAPE self.fault_aware = kwargs.get('faultaware') self.SENSITIVITY = default_SENSITIVITY self.SPECIFICITY = default_SPECIFICITY if self.fault_aware: self.SENSITIVITY = float(kwargs.get('sensitivity', default_SENSITIVITY)) self.SPECIFICITY = float(kwargs.get('specificity', defalt_SPECIFICITY)) if self.failure_log or self.weibull: self.FAILURE_FREE = False #initialize time stamps and job queues #time stamp format: ('EVENT', 'time_stamp_date', time_stamp_second, {'job_id':str(jobid), 'location':[partition1, partition2,...]}) self.time_stamps = [('I', '0', 0, {})] self.cur_time_index = 0 self.queues = SimQueueDict(policy=kwargs['policy']) self.init_queues() self.visible_jobs = [] #initialize failures self.failure_dict = {} if not self.FAILURE_FREE: if self.failure_log: #if specified failure log, use log trace failure self.inject_failures() elif self.weibull: #else MAKE failures by Weibull distribution self.make_failures() #initialize PBS-style logger self.pbslog = PBSlogger(self.output_log) #initialize debug logger self.dbglog = PBSlogger(self.output_log+"-debug") #finish tag self.finished = False #tag for controlling time stamp increment self.increment_tag = True #register local alias "system" for this component local_components["system"] = self print "Simulation starts:" def register_alias(self): '''register alternate name for the Qsimulator, by registering in slp with another name for the same location. 
in this case 'system' is the alternate name''' try: slp = Cobalt.Proxy.ComponentProxy("service-location", defer=False) except ComponentLookupError: print >> sys.stderr, "unable to find service-location" qsim_quit() svc_location = slp.locate(self.name) if svc_location: slp.register(self.alias, svc_location) register_alias = automatic(register_alias, 30) def is_finished(self): return self.finished is_finished = exposed(is_finished) def init_partition(self, namelist): '''add all paritions and apply activate and enable''' func = self.add_partitions args = ([{'tag':'partition', 'name':partname, 'size':"*", 'functional':False, 'scheduled':False, 'queue':"*", 'deps':[]} for partname in namelist],) apply(func, args) func = self.set_partitions args = ([{'tag':'partition', 'name':partname} for partname in namelist], {'scheduled':True, 'functional': True}) apply(func, args) def get_current_time_event(self): return self.time_stamps[self.cur_time_index][0] def get_current_time(self): '''get current time in date format''' return self.time_stamps[self.cur_time_index][1] def get_current_time_sec(self): return self.time_stamps[self.cur_time_index][2] get_current_time_sec = exposed(get_current_time_sec) def get_current_time_job(self): ret = None if self.time_stamps[self.cur_time_index][3].has_key('jobid'): ret = self.time_stamps[self.cur_time_index][3]['jobid'] return ret def get_current_time_partition(self): if self.get_current_time_event() in set(["R","S"]): return self.time_stamps[self.cur_time_index][3]['location'] else: return None def get_current_time_stamp(self): '''get current time stamp index''' return self.cur_time_index get_current_time_stamp = exposed(get_current_time_stamp) def get_current_time_stamp_tuple(self): return self.time_stamps[self.cur_time_index] def time_increment(self): '''the current time stamp increments by 1''' if self.cur_time_index < len(self.time_stamps) - 1: self.cur_time_index += 1 print " " print str(self.get_current_time()) + \ " Time stamp is incremented by 1, current time stamp: " + \ str(self.cur_time_index) else: print str(self.get_current_time()) +\ " Reached maximum time stamp: %s, simulating finished! " \ % (str(self.cur_time_index)) self.finished = True self.pbslog.closeLog() qsim_quit() #simulation completed, exit!!! 
return self.cur_time_index def insert_time_stamp(self, new_time_date, event, info): '''insert time stamps in the same order''' if event not in SET_event: print "invalid event type,", event return new_time_sec = date_to_sec(new_time_date) new_time_tuple = (event, new_time_date, new_time_sec, info) pos = len(self.time_stamps) while new_time_sec < self.time_stamps[pos-1][2]: pos = pos - 1 self.time_stamps.insert(pos, new_time_tuple) #print "insert time stamp ", new_time_tuple, " at pos ", pos return pos def init_queues(self): '''parses the work load log file, initializes queues and sorted time stamp list''' raw_jobs = parse_work_load(self.workload_file) specs = [] tag = 0 for key in raw_jobs: spec = {'valid':True} tmp = raw_jobs[key] spec['jobid'] = tmp.get('jobid') spec['queue'] = tmp.get('queue') #convert submittime from "%m/%d/%Y %H:%M:%S" to Unix time sec format_sub_time = tmp.get('submittime') if format_sub_time: spec['submittime'] = date_to_sec(format_sub_time) spec['first_subtime'] = spec['submittime'] #set the first submit time else: spec['valid'] = False #convert walltime from 'hh:mm:ss' to float of minutes format_walltime = tmp.get('Resource_List.walltime') spec['walltime'] = 0 if format_walltime: segs = format_walltime.split(':') spec['walltime'] = str(int(segs[0])*60 + int(segs[1])) else: #invalid job entry, discard spec['valid'] = False if tmp.get('start') and tmp.get('end'): act_run_time = float(tmp.get('end')) - float(tmp.get('start')) spec['runtime'] = str(round(act_run_time, 1)) if IDEALWALLTIME: wtime = (round(act_run_time / 60, 2) + float(spec['walltime']))/2 #wtime = act_run_time / 60 spec['walltime'] = str(round(wtime, 2)) else: spec['valid'] = False if tmp.get('Resource_List.nodect'): spec['nodes'] = tmp.get('Resource_List.nodect') else: #invalid job entry, discard spec['valid'] = False if tmp.get('user'): spec['user'] = tmp.get('user') if tmp.get('project'): spec['project'] = tmp.get('project') spec['state'] = 'invisible' spec['start_time'] = '0' spec['end_time'] = '0' #add the job spec to the spec list if spec['valid'] == True: specs.append(spec) #adjust workload density if FRACTION != 1: tune_workload(specs, FRACTION) print "workload adjusted: last submit job=", specs[len(specs)-1].get('submittime') print "Initializing jobs and time stamps list, wait one moment... ..." 
for spec in specs: format_sub_time = sec_to_date(spec['submittime']) if not self.time_stamps.__contains__(format_sub_time): self.insert_time_stamp(format_sub_time, 'Q', {'jobid':str(spec['jobid'])}) print "total job number:", len(specs) self.add_jobs(specs) return 0 def log_job_event(self, eventtype, timestamp, spec): '''log job events(Queue,Start,End) to PBS-style log''' def len2 (_input): _input = str(_input) if len(_input) == 1: return "0" + _input else: return _input if eventtype == 'Q': #submitted(queued) for the first time message = "%s;Q;%d;queue=%s" % (timestamp, spec['jobid'], spec['queue']) elif eventtype == 'R': #resume running after failure recovery message = "%s;R;%s" % (timestamp, ":".join(spec['location'])) else: wall_time = spec['walltime'] walltime_minutes = len2(int(float(wall_time)) % 60) walltime_hours = len2(int(float(wall_time)) // 60) log_walltime = "%s:%s:00" % (walltime_hours, walltime_minutes) if eventtype == 'S': #start running message = "%s;S;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s start=%s exec_host=%s" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, spec['start_time'], ":".join(spec['location'])) elif eventtype == 'E': #end message = "%s;E;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s start=%s end=%f exec_host=%s runtime=%s" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, spec['start_time'], round(float(spec['end_time']), 1), ":".join(spec['location']), spec['runtime']) elif eventtype == 'F': #failure frag_runtime = round(float(spec['failure_time']) - float(spec['start_time']), 1) #running time before failure(after the latest start) message = "%s;F;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s exec_host=%s start=%s frag_runtime=%s complete=%f" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, ":".join(spec['location']), spec['start_time'], frag_runtime, round(frag_runtime / float(spec['runtime']), 2) ) elif eventtype == 'P': #pending message = "%s;P;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s exec_host=%s start=%s" % \ (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, ":".join(spec['location']), spec['start_time'], ) else: print "invalid event type, type=", type return self.pbslog.LogMessage(message) def get_new_states(self, jobspec): '''return the new state updates of a specific job at specific time stamp, including invisible->queued, running->ended''' updates = {} curstate = jobspec['state'] newstate = curstate job_id = jobspec['jobid'] cur_event = self.get_current_time_event() #handle job submssion event if cur_event == 'Q' and curstate == "invisible": newstate = "queued" updates['is_runnable'] = True updates['is_visible'] = True self.log_job_event('Q', self.get_current_time(), jobspec) #handle job completion event elif cur_event == 'E' and curstate == "running": newstate = "ended" updates['is_runnable'] = False updates['has_resources'] = False updates['is_visible'] = False #release partition immediately partitions = jobspec['location'] for partition in partitions: self.release_partition(partition) self.queues.del_jobs([{'jobid':job_id}]) #write to output log if jobspec['end_time']: end = float(jobspec['end_time']) else: end = 0 end_datetime = sec_to_date(end) self.log_job_event('E', end_datetime, jobspec) #handle job failure event elif cur_event == 'F' and curstate == 
"running": print "entered failure handling" #release partition partitions = jobspec['location'] for partition in partitions: print "partition %s start repairing" % (partition) self.start_repair_partition(partition) #write to output log if jobspec['failure_time']: fail = float(jobspec['failure_time']) else: fail = 0 failure_datetime = sec_to_date(fail) self.log_job_event('F', failure_datetime, jobspec) print self.get_current_time(), " job %d failed at %s!!" % (job_id, ":".join(jobspec['location'])) rec_updates = self.recovery_mgr(jobspec) if not rec_updates == {}: updates.update(rec_updates) updates['has_resources'] = False if updates.has_key('state'): newstate = updates['state'] if CHECKPOINT: print "enter checkpoint handling****" #runtime before failed after latest start frag_runtime = float(jobspec['failure_time']) - float(jobspec['start_time']) updates['remain_time'] = jobspec['remain_time'] - frag_runtime else:#other event pass if updates and not curstate == newstate: print self.get_current_time(), "state changed, job", job_id, \ ":", curstate, "->", newstate updates['state'] = newstate return updates def update_job_states(self, specs, updates): '''update the state of the jobs associated to the current time stamp''' def _update_job_states(job, newattr): '''callback function to update job states''' temp = job.to_rx() newattr = self.get_new_states(temp) if newattr: temp.update(newattr) job.update(newattr) ids_str = self.get_current_time_job() ids = ids_str.split(':') cur_event = self.get_current_time_event() for id in ids: for spec in specs: spec['jobid'] = int(id) ret_jobs = self.queues.get_jobs(specs, _update_job_states, updates) if cur_event == "Q": self.visible_jobs.extend(ret_jobs) elif cur_event=="E": self.visible_jobs = [j for j in self.visible_jobs if j not in ret_jobs] return 0 def run_job_updates(self, jobspec, newattr): ''' return the state updates (including state queued -> running, setting the start_time, end_time)''' updates = {} #print "enter run_job_updates, jobspec=", jobspec start = self.get_current_time_sec() updates['start_time'] = start updates['starttime'] = start updates['state'] = 'running' updates['system_state'] = 'running' updates['is_runnable'] = False updates['has_resources'] = True print self.get_current_time(), "run job state change, job", jobspec['jobid'], \ ":", jobspec['state'], "->", updates['state'] #determine whether the job is going to fail before completion location = newattr['location'] duration = jobspec['remain_time'] #print "duration=", duration nearest_failure = self.get_next_failure(location, start, duration) if (nearest_failure): updates['failure_time'] = date_to_sec(nearest_failure) new_time_stamp = nearest_failure self.insert_time_stamp(new_time_stamp, 'F', {'jobid':str(jobspec['jobid'])}) else: # will complete end = start + duration updates['end_time'] = end new_time_stamp = sec_to_date(end) #print "new_time_stamp=", new_time_stamp self.insert_time_stamp(new_time_stamp, 'E', {'jobid':str(jobspec['jobid'])}) updates.update(newattr) return updates def start_job(self, specs, updates): '''update the job state and start_time and end_time when cqadm --run is issued to a group of jobs''' partitions = updates['location'] for partition in partitions: self.reserve_partition(partition) def _start_job(job, newattr): '''callback function to update job start/end time''' temp = job.to_rx() newattr = self.run_job_updates(temp, newattr) temp.update(newattr) job.update(newattr) self.log_job_event('S', self.get_current_time(), temp) return 
self.queues.get_jobs(specs, _start_job, updates) def add_jobs(self, specs): '''Add a job, currently for unit test only''' response = self.queues.add_jobs(specs) return response add_jobs = exposed(query(add_jobs)) def get_jobs(self, specs): '''get a list of jobs, each time triggers time stamp increment and job states update''' jobs = [] if self.increment_tag: self.time_increment() eventtype = self.get_current_time_event() print "current event type====", eventtype if eventtype == "R": self.release_repaired_partition() #if the repaired job associated with some pending jobs, #returen empty list to scheduler, in order to ensure the next #time stamp will restart the pending job other than scheduling other jobs at this time stamp #this will avoid run multiple jobs on the same partition(once a bug, solved) if self.get_current_time_job(): return jobs elif eventtype == "S": self.restart_pending_job() return jobs else: self.update_job_states(specs, {}) if len(self.recovering_jobs) > 0: self.update_recovering_jobs({}) self.increment_tag = True jobs = self.visible_jobs # print "running jobs=", [job.jobid for job in self.running_jobs] # print "queueing jobs=", [job.jobid for job in self.queuing_jobs] # print "visible jobs=", [job.jobid for job in self.visible_jobs] # print "return jobs=", len(jobs) return jobs get_jobs = exposed(query(get_jobs)) def update_recovering_jobs(self, updates): print "enter update_recovering_jobs()" def _update_recovering_jobs(job, newattr): '''callback function to update job states''' temp = job.to_rx() print "temp=", temp newattr = self.recovery_mgr(temp) print "update_recovering_jobs newattr=", newattr print "temp=", temp if newattr: temp.update(newattr) job.update(newattr) ids = [job.jobid for job in self.recovering_jobs] print "ids=", ids ret = self.queues.get_jobs([{'tag':"job", 'state': "recovering"}], _update_recovering_jobs, updates) return 0 def _get_queuing_jobs(self): return [job for job in self.visible_jobs if job.is_runnable==True] queuing_jobs = property(_get_queuing_jobs) def _get_running_jobs(self): return [job for job in self.visible_jobs if job.has_resources==True] running_jobs = property(_get_running_jobs) def _get_recovering_jobs(self): return self.queues.get_jobs([{'jobid':"*", 'state':"recovering"}]) recovering_jobs = property(_get_recovering_jobs) def get_visible_jobs(self): return self.visible_jobs; get_visible_jobs = exposed(get_visible_jobs) def get_running_jobs(self): return [job for job in self.visible_jobs if job.has_resources==True] get_running_jobs = exposed(get_running_jobs) def get_queuing_jobs(self): return [job for job in self.visible_jobs if job.is_runnable==True] get_queuing_jobs = exposed(get_queuing_jobs) def _get_job_by_id(self, jobid): jobs = self.queues.get_jobs([{'jobid':jobid}]) if len(jobs) == 1: return jobs[0] else: return None def add_queues(self, specs): '''add queues''' return self.queues.add_queues(specs) add_queues = exposed(query(add_queues)) def get_queues(self, specs): '''get queues''' return self.queues.get_queues(specs) get_queues = exposed(query(get_queues)) def run_jobs(self, specs, nodelist): '''run a queued job, by updating the job state, start_time and end_time''' print "run job specs=", specs, " on partion", nodelist if specs: self.start_job(specs, {'location': nodelist}) #set tag false, enable scheduling another job at the same time self.increment_tag = False #print "current running jobs=", [job.jobid for job in self.running_jobs] return self.running_jobs run_jobs = exposed(query(run_jobs)) def get_midplanes(self, 
partname): '''return a list of sub-partitions each contains 512-nodes(midplane)''' midplane_list = [] partition = self._partitions[partname] if partition.size == MIDPLANE_SIZE: midplane_list.append(partname) elif partition.size > MIDPLANE_SIZE: children = partition.children for part in children: if self._partitions[part].size == MIDPLANE_SIZE: midplane_list.append(part) else: parents = partition.parents for part in parents: if self._partitions[part].size == MIDPLANE_SIZE: midplane_list.append(part) return midplane_list def get_next_failure(self, location, now, duration): '''return the next(closest) failure moment according the partition failure list''' if (self.FAILURE_FREE): return None def _find_next_failure(partname, now): next = None failure_list = self.failure_dict[partname] if failure_list: for fail_time in failure_list: if date_to_sec(fail_time) > now: next = fail_time break return next closest_fail_sec = MAXINT partitions = location midplanes = set() for partition in partitions: tmp_midplanes = self.get_midplanes(partition) for item in tmp_midplanes: if item not in midplanes: midplanes.add(item) for midplane in midplanes: next = _find_next_failure(midplane, now) if (next): next_sec = date_to_sec(next) if next_sec < closest_fail_sec: closest_fail_sec =next_sec if closest_fail_sec == MAXINT: next_failure_date = None else: job_end_sec = now + duration if closest_fail_sec < job_end_sec: next_failure_date = sec_to_date(closest_fail_sec) else: next_failure_date = None #print "next_failure_date=", next_failure_date return next_failure_date def will_job_fail(self, mtbf, nodes, hours): '''simulate static failure chance, [not used]''' return False print "mtbf=%d, nodes=%d, hours=%f" % (mtbf,nodes,hours) failure_chance = 1 - (1 - hours * 1.0/mtbf) ** nodes if failure_chance > 0.7 : failure_chance = 0.7 random_num = random.random() print "failure chance=%f, random_num=%f" % (failure_chance, random_num) if random_num < failure_chance: return True else: return False def nodes_static(self): '''static the node requested by each job, [not used]''' jobs = self.queues.get_jobs([{'jobid':"*", 'queue':"*", 'nodes':"*"}]) nodesdict = {} for job in jobs: nodes = int(job.nodes) nodesstr = nodes if (nodesdict.has_key(nodesstr)): nodesdict[nodesstr] = nodesdict[nodesstr] + 1 else: nodesdict[nodesstr] = 1 keys = nodesdict.keys() keys.sort() for key in keys: print key, ":", nodesdict[key] def gen_failure_list(self, scale, shape, startdate, enddate): '''generate a synthetic failure time list based on weibull distribution and start/end date time''' failure_moments = [] ttf_list = [] start = date_to_sec(startdate) end = date_to_sec(enddate) cur_failure = start while True: ttf = random.weibullvariate(scale,shape) cur_failure += ttf if cur_failure < end: ttf_list.append(ttf) failure_moments.append(sec_to_date(cur_failure)) else: break return failure_moments, ttf_list def make_failures(self): '''generate failure lists for each 512-nodes partition''' ttf_dict = {} start = self.time_stamps[1][1] end = self.time_stamps[len(self.time_stamps)-1][1] for partition in self._partitions.values(): if partition.size == MIDPLANE_SIZE: fl, ttfs = self.gen_failure_list(self.SCALE, self.SHAPE, start, end) self.failure_dict[partition.name] = fl ttf_dict[partition.name] = ttfs partnames = self.failure_dict.keys() partnames.sort() f = open(default_FAILURE_LOG, "w") total_f = 0 mtbf = 0 for part in partnames: f_list = self.failure_dict[part] print part, " ", f_list f.write("%s;%s\n" % (part, ";".join(f_list))) total_f += len(f_list) 
ttfs = ttf_dict[part] if len(ttfs)==0: mtbf = 0 else: total = 0 for ttf in ttfs: total += ttf mtbf = total / len(ttfs) start_sec = date_to_sec(start) end_sec = date_to_sec(end) f.write("Total=%d\nMTBF=%f" % (total_f, (end_sec-start_sec)/(total_f*3600))) f.close() def inject_failures(self): '''parse failure trace log to make failure list for each 1-midplane partition''' raw_job_dict = {} partnames = set(self._partitions.keys()) flog = open(self.failure_log, "r") self.failure_dict = {} for line in flog: print "line=", line line = line.strip('\n') parsedline = line.split(";") print "parsedline=", parsedline failure_list = [] part = parsedline[0] if part in partnames: for i in range(1, len(parsedline)): failure_moment = parsedline[i] if len(failure_moment) == 0: continue failure_list.append(failure_moment) self.failure_dict[part] = failure_list partnames = self.failure_dict.keys() partnames.sort() for part in partnames: f_list = self.failure_dict[part] print part, " ", f_list def get_failure_chance(self, location, duration): now = date_to_sec(self.get_current_time()) next_fail = self.get_next_failure(location, now, duration) if (next_fail != None): return self.SENSITIVITY else: return 1 - self.SPECIFICITY get_failure_chance = exposed(get_failure_chance) def recovery_mgr(self, jobspec): """Recovery manager, this function can be extended to support various recovery options. at this version, the failed job is sent back to the rear of the queue. The extended code is ready and available at private code branch(wtang).""" updates = {} updates = self.handle_reque_rear(jobspec) recovery_option = jobspec['recovery_opt'] print "rec_opt=", recovery_option #if_else structure remains room for recovery option extending if recovery_option == 1: #resubmit the job #resubmit the job, the submit time changed to NOW updates = self.handle_reque_rear(jobspec) return updates def handle_reque_rear(self, jobspec): '''handle option 1 - resubmit the job to rear of waiting queue''' updates = {} updates['state'] = "queued" updates['start_time'] = 0 updates['submittime'] = self.get_current_time_sec() return updates def start_repair_partition(self, partname): '''partition failed, assuming get repaired MTTR seconds later''' now = self.get_current_time_sec() time_to_repair = now + MTTR time_to_repair_date = sec_to_date(time_to_repair) self.insert_time_stamp(time_to_repair_date, "R", {'location':partname}) def release_repaired_partition(self): '''enter release_repaired_partition() partition repaired''' partition = self.get_current_time_partition() if partition == None: return False self.release_partition(partition) print "partition %s gets repaired" % (partition) self.log_job_event('R', self.get_current_time(), {'location':partition}) return True def restart_pending_job(self): '''restart jobs that pending for the nodes repair''' partname = self.get_current_time_partition() print "enter restart_pending_job() partname=", partname ids_str = self.get_current_time_job() ids = ids_str.split(':') jobspecs = [] for id in ids: spec = {'tag':'job', 'jobid':int(id)} jobspecs.append(spec) print "restart pending job ", jobspecs, " on repaired partition ", partname self.run_jobs(jobspecs, [partname]) def possible_locations(self, job): '''find the partitions with the size that can right accomodates the job (returned partions are not necessarily idle)''' locations = [] proper_partsize = 64 job_nodes = int(job['nodes']) for psize in self.part_size_list: if psize >= job_nodes: proper_partsize = psize break for part in 
self.cached_partitions.itervalues(): if int(part.size) == proper_partsize: locations.append(part) return locations def _find_job_location(self, args, drain_partitions=set(), backfilling=False): jobid = args['jobid'] nodes = args['nodes'] queue = args['queue'] utility_score = args['utility_score'] walltime = args['walltime'] forbidden = args.get("forbidden", []) required = args.get("required", []) best_score = sys.maxint best_partition = None # get partitions of proper size as the candidates candidate_partitions = self.possible_locations(args) #exclude the partitions already drained if drain_partitions: candidate_partitions = [part for part in candidate_partitions if part not in drain_partitions] now = self.get_current_time_sec() for partition in candidate_partitions: #skip partitions that are not "idle" if partition.state != "idle": continue if backfilling: #skip the partition with too short cutoff to backfill the job if 60*float(walltime) > (partition.backfill_time - now): continue # let's check the impact on partitions that would become blocked score = 0 for p in partition.parents: if self.cached_partitions[p].state == "idle" and self.cached_partitions[p].scheduled: score += 1 for ch in partition.children: score += 0.01 if (FAULTAWARE): Pf = 0 Pf = self.get_failure_chance(partition.name, 60*float(walltime)) score += Pf # the lower the score, the fewer new partitions will be blocked by this selection if score < best_score: best_score = score best_partition = partition elif score == best_score: if partition.name > best_partition.name: best_partition = partition if best_partition: #print "return bestpartition=",{jobid: [best_partition.name, best_partition.state]} return {jobid: [best_partition.name]} def find_job_location(self, arg_list, end_times): best_partition_dict = {} if self.bridge_in_error: print "bridge_in_error" return {} self.cached_partitions = self.partitions # first, figure out backfilling cutoffs per partition (which we'll also # use for picking which partition to drain) job_end_times = {} for item in end_times: job_end_times[item[0][0]] = item[1] now = self.get_current_time_sec() for p in self.cached_partitions.itervalues(): if p.state == "idle": p.backfill_time = now else: p.backfill_time = now + 5*60 p.draining = False for p in self.cached_partitions.itervalues(): if p.name in job_end_times: if job_end_times[p.name] > p.backfill_time: p.backfill_time = job_end_times[p.name] for parent_name in p.parents: parent_partition = self.cached_partitions[parent_name] if p.backfill_time > parent_partition.backfill_time: parent_partition.backfill_time = p.backfill_time for p in self.cached_partitions.itervalues(): if p.backfill_time == now: continue for child_name in p.children: child_partition = self.cached_partitions[child_name] if child_partition.backfill_time == now or child_partition.backfill_time > p.backfill_time: child_partition.backfill_time = p.backfill_time # first time through, try for starting jobs based on utility scores drain_partitions = set() # the sets draining_jobs and cannot_start are for efficiency, not correctness draining_jobs = set() cannot_start = set() for idx in range(len(arg_list)): winning_job = arg_list[idx] for jj in range(idx, len(arg_list)): job = arg_list[jj] # this job isn't good enough! 
if job['utility_score'] < winning_job['threshold']: break if job['jobid'] not in cannot_start: partition_name = self._find_job_location(job, drain_partitions) if partition_name: best_partition_dict.update(partition_name) break cannot_start.add(job['jobid']) # we already picked a drain location for the winning job if winning_job['jobid'] in draining_jobs: continue location = self._find_drain_partition(winning_job) if location is not None: for p_name in location.parents: drain_partitions.add(self.cached_partitions[p_name]) for p_name in location.children: drain_partitions.add(self.cached_partitions[p_name]) self.cached_partitions[p_name].draining = True drain_partitions.add(location) #self.logger.info("job %s is draining %s" % (winning_job['jobid'], location.name)) #self.dbglog.LogMessage("job %s is draining %s" % (winning_job['jobid'], location.name)) location.draining = True draining_jobs.add(winning_job['jobid']) # at this time, we only want to try launching one job at a time if best_partition_dict: # msg = "idx=%s, jj=%s, job=%s, partition=%s" % (idx, jj, job['jobid'],best_partition_dict[job['jobid']]) #print msg # self.dbglog.LogMessage(msg) break # the next time through, try to backfill, but only if we couldn't find anything to start if not best_partition_dict: # arg_list.sort(self._walltimecmp) # msg = "try to backfill jobs..." # self.dbglog.LogMessage(msg) for args in arg_list: partition_name = self._find_job_location(args, backfilling=True) if partition_name: msg = "backfilling job %s(%s)" % (args['jobid'], args['nodes']) self.logger.info(msg) self.dbglog.LogMessage(msg) best_partition_dict.update(partition_name) break # reserve the stuff in the best_partition_dict, as those partitions are allegedly going to # be running jobs very soon # # also, this is the only part of finding a job location where we need to lock anything #self._partitions_lock.acquire() try: for p in self.partitions.itervalues(): # push the backfilling info from the local cache back to the real objects p.draining = self.cached_partitions[p.name].draining p.backfill_time = self.cached_partitions[p.name].backfill_time for partition_list in best_partition_dict.itervalues(): part = self.partitions[partition_list[0]] ##part.reserved_until = self.get_current_time_sec() + 5*60 part.state = "starting job" for p in part._parents: if p.state == "idle": p.state = "blocked by starting job" for p in part._children: if p.state == "idle": p.state = "blocked by starting job" except: self.logger.error("error in find_job_location", exc_info=True) #self._partitions_lock.release() #print "best_partition_dict=", best_partition_dict return best_partition_dict find_job_location = locking(exposed(find_job_location))
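# A standalone sketch of the synthetic failure-trace generation gen_failure_list/make_failures
# perform above: inter-failure times are drawn from a Weibull distribution and accumulated
# from the start of the simulated window, keeping only the moments that fall before the end.
# Times here are plain seconds; the simulator itself converts to and from date strings with
# sec_to_date/date_to_sec.
import random

def weibull_failure_times(scale, shape, start_sec, end_sec):
    """Return failure moments (in seconds) within [start_sec, end_sec)."""
    moments = []
    current = start_sec
    while True:
        current += random.weibullvariate(scale, shape)
        if current >= end_sec:
            break
        moments.append(current)
    return moments

# Example (as a comment): with a scale of about one day and shape 1, roughly one failure per
# day on average over a simulated week.
#   weibull_failure_times(86400.0, 1.0, 0.0, 7 * 86400.0)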
"%s: an unexpected exception occurred while attempting to start the process group " "using the %s component; releasing resources", pgroup.label, pgroup.forker, exc_info=True) self.reserve_resources_until(pgroup.location, None, pgroup.jobid) pgroup.exit_status = 255 else: self.logger.error( "%s: the internal reservation on %s expired; job has been terminated", pgroup.label, pgroup.location) pgroup.exit_status = 255 return process_groups add_process_groups = exposed(query(all_fields=True)(add_process_groups)) def get_process_groups(self, specs): """Query process_groups from the simulator.""" self._get_exit_status() return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def _get_exit_status(self): #common to bgsystem running = [] active_forker_components = [] for forker_component in ['bg_mpirun_forker', 'user_script_forker']:
class BGBaseSystem(Component): """base system class. Methods: add_partitions -- tell the system to manage partitions (exposed, query) get_partitions -- retrieve partitions in the simulator (exposed, query) del_partitions -- tell the system not to manage partitions (exposed, query) set_partitions -- change random attributes of partitions (exposed, query) update_relatives -- should be called when partitions are added and removed from the managed list """ def __init__(self, *args, **kwargs): Component.__init__(self, *args, **kwargs) self._partitions = PartitionDict() self._managed_partitions = set() self.process_groups = BGProcessGroupDict() self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.pending_diags = dict() self.failed_diags = list() self.bridge_in_error = False self.cached_partitions = None self.offline_partitions = [] def _get_partitions(self): return PartitionDict([(partition.name, partition) for partition in self._partitions.itervalues() if partition.name in self._managed_partitions]) partitions = property(_get_partitions) def add_partitions(self, specs, user_name=None): self.logger.info("%s called add_partitions(%r)", user_name, specs) specs = [{'name': spec.get("name")} for spec in specs] self._partitions_lock.acquire() try: partitions = [ partition for partition in self._partitions.q_get(specs) if partition.name not in self._managed_partitions ] except: partitions = [] self.logger.error("error in add_partitions", exc_info=True) self._partitions_lock.release() self._managed_partitions.update( [partition.name for partition in partitions]) self.update_relatives() return partitions add_partition = exposed(query(add_partitions)) def get_partitions(self, specs): """Query partitions on simulator.""" self._partitions_lock.acquire() try: partitions = self.partitions.q_get(specs) except: partitions = [] self.logger.error("error in get_partitions", exc_info=True) self._partitions_lock.release() return partitions get_partitions = exposed(query(get_partitions)) def verify_locations(self, location_list): """Providing a system agnostic interface for making sure a 'location string' is valid""" parts = self.get_partitions([{'name': l} for l in location_list]) return [p.name for p in parts] verify_locations = exposed(verify_locations) def del_partitions(self, specs, user_name=None): """Remove partitions from the list of managed partitions""" self.logger.info("%s called del_partitions(%r)", user_name, specs) self._partitions_lock.acquire() try: partitions = [ partition for partition in self._partitions.q_get(specs) if partition.name in self._managed_partitions ] except: partitions = [] self.logger.error("error in del_partitions", exc_info=True) self._partitions_lock.release() self._managed_partitions -= set( [partition.name for partition in partitions]) self.update_relatives() return partitions del_partitions = exposed(query(del_partitions)) def set_partitions(self, specs, updates, user_name=None): """Update random attributes on matching partitions""" def _set_partitions(part, newattr): self.logger.info("%s updating partition %s: %r", user_name, part.name, newattr) part.update(newattr) self._partitions_lock.acquire() try: partitions = self._partitions.q_get(specs, _set_partitions, updates) except: partitions = [] self.logger.error("error in set_partitions", exc_info=True) self._partitions_lock.release() return partitions set_partitions = exposed(query(set_partitions)) def update_relatives(self): """Call this method after changing the contents of 
self._managed_partitions""" for p_name in self._managed_partitions: self._partitions[p_name]._parents = set() self._partitions[p_name]._children = set() for p in self._partitions.itervalues(): p._all_children = set() for p_name in self._managed_partitions: p = self._partitions[p_name] #Check the wiring dependencies of our children. #Touching those would be bad. --PMR # new_parents = [] # for par in p._parents: # for dep_name in par._wiring_conflicts: # if dep_name in self._managed_partitions: # new_parents.append(self._partitions[dep_name]) # p._parents.union(set(new_parents)) # # for child in p._children: # for dep_name in child._wiring_conflicts: # if dep_name in self._managed_partitions: # p._parents.add(self._partitions[dep_name]) # toss the wiring dependencies in with the parents for dep_name in p._wiring_conflicts: if dep_name in self._managed_partitions: p._parents.add(self._partitions[dep_name]) for other in self._partitions.itervalues(): if p.name == other.name: continue p_set = set(p.node_cards) other_set = set(other.node_cards) if other.name in self._managed_partitions: # if p is a subset of other, then p is a child; add other to p's list of managed parent partitions, and p to # other's list of managed child partitions if p_set.intersection(other_set) == p_set: p._parents.add(other) other._children.add(p) # if p contains other, then p is a parent; add other to p's list of managed child partitions and p to other's # list of managed parent partitions elif p_set.union(other_set) == p_set: p._children.add(other) other._parents.add(p) # if p contains other, then p is a parent; add other to p's list of all child partitions if p_set.union(other_set) == p_set: p._all_children.add(other) #Let's get the wiring conflicts for direct childeren as well, #we shouldn't be able to run on these either. 
    def validate_job(self, spec):
        """validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        # spec has {nodes, walltime*, procs, mode, kernel}
        max_nodes = max([int(p.size) for p in self._partitions.values()])
        try:
            sys_type = CP.get('bgsystem', 'bgtype')
        except:
            sys_type = 'bgl'
        if sys_type == 'bgp':
            job_types = ['smp', 'dual', 'vn', 'script']
        else:
            job_types = ['co', 'vn', 'script']

        try:
            spec['nodecount'] = int(spec['nodecount'])
        except:
            raise JobValidationError("Non-integer node count")
        if not 0 < spec['nodecount'] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec['time']) < 5:
            raise JobValidationError("Walltime less than minimum")

        if not spec['mode']:
            if sys_type == 'bgp':
                spec['mode'] = 'smp'
            else:
                spec['mode'] = 'co'
        if spec['mode'] not in job_types:
            raise JobValidationError("Invalid mode")

        if spec['attrs'].has_key("location"):
            p_name = spec['attrs']['location']
            if not self.partitions.has_key(p_name):
                raise JobValidationError("Partition %s not found" % p_name)

        if not spec['proccount']:
            if spec.get('mode', 'co') == 'vn':
                if sys_type == 'bgl':
                    spec['proccount'] = str(2 * int(spec['nodecount']))
                elif sys_type == 'bgp':
                    spec['proccount'] = str(4 * int(spec['nodecount']))
                else:
                    self.logger.error("Unknown bgtype %s" % (sys_type))
            elif spec.get('mode', 'co') == 'dual':
                spec['proccount'] = 2 * int(spec['nodecount'])
            else:
                spec['proccount'] = spec['nodecount']
        else:
            try:
                spec['proccount'] = int(spec['proccount'])
            except:
                raise JobValidationError("non-integer proccount")
            if spec['proccount'] < 1:
                raise JobValidationError("negative proccount")
            if spec['proccount'] > spec['nodecount']:
                if spec['mode'] not in ['vn', 'dual']:
                    raise JobValidationError("proccount too large")
                if sys_type == 'bgl' and (spec['proccount'] > (2 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
                elif sys_type == 'bgp' and (spec['proccount'] > (4 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")

        # need to handle kernel
        return spec
    validate_job = exposed(validate_job)
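    # Illustration (not part of the original component): the proccount defaults that
    # validate_job fills in are simple multiples of the node count.  For example, a
    # hypothetical 512-node BG/P job would get
    #
    #     mode 'smp'  -> proccount 512    (1 process per node)
    #     mode 'dual' -> proccount 1024   (2 * 512)
    #     mode 'vn'   -> proccount 2048   (4 * 512)
    #
    # while on BG/L, 'vn' mode doubles rather than quadruples the node count.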
    def run_diags(self, partition_list, test_name, user_name=None):
        self.logger.info("%s running diags %s on partitions %s", user_name, test_name, partition_list)

        def size_cmp(left, right):
            return -cmp(left.size, right.size)

        def _find_covering(partition):
            kids = [self._partitions[c_name] for c_name in partition.children]
            kids.sort(size_cmp)
            n = len(kids)
            part_node_cards = set(partition.node_cards)
            # generate the power set, but try to use the big partitions first (hence the sort above)
            for i in xrange(1, 2**n + 1):
                test_cover = [kids[j] for j in range(n) if i & 2**j]
                test_node_cards = set()
                for t in test_cover:
                    test_node_cards.update(t.node_cards)

                if test_node_cards.issubset(part_node_cards) and test_node_cards.issuperset(part_node_cards):
                    return test_cover

            return []

        def _run_diags(partition):
            covering = _find_covering(partition)
            for child in covering:
                self.pending_diags[child] = test_name
            return [child.name for child in covering]

        results = []
        for partition_name in partition_list:
            p = self._partitions[partition_name]
            results.append(_run_diags(p))

        return results
    run_diags = exposed(run_diags)

    def launch_diags(self, partition, test_name):
        '''override this method in derived classes!'''
        pass

    def finish_diags(self, partition, test_name, exit_value):
        '''call this method somewhere in your derived class where you deal
        with the exit values of diags'''
        if exit_value == 0:
            for dead in self.failed_diags[:]:
                if dead == partition.name or dead in partition.children:
                    self.failed_diags.remove(dead)
                    self.logger.info("removing %s from failed_diags list" % dead)
        else:
            if partition.children:
                self.run_diags([partition.name], test_name)
            else:
                self.failed_diags.append(partition.name)
                self.logger.info("adding %s to failed_diags list" % partition.name)

    def handle_pending_diags(self):
        for p in self.pending_diags.keys():
            if p.state in ["idle", "blocked by pending diags", "failed diags", "blocked by failed diags"]:
                self.logger.info("launching diagnostics on %s" % p.name)
                self.launch_diags(p, self.pending_diags[p])
                del self.pending_diags[p]
    handle_pending_diags = automatic(handle_pending_diags)

    def fail_partitions(self, specs, user_name=None):
        self.logger.info("%s failing partition %s", user_name, specs)
        parts = self.get_partitions(specs)
        if not parts:
            ret = "no matching partitions found\n"
        else:
            ret = ""
        for p in parts:
            if self.failed_diags.count(p.name) == 0:
                ret += "failing %s\n" % p.name
                self.failed_diags.append(p.name)
            else:
                ret += "%s is already marked as failing\n" % p.name
        return ret
    fail_partitions = exposed(fail_partitions)

    def unfail_partitions(self, specs, user_name=None):
        self.logger.info("%s unfailing partition %s", user_name, specs)
        parts = self.get_partitions(specs)
        if not parts:
            ret = "no matching partitions found\n"
        else:
            ret = ""
        for p in self.get_partitions(specs):
            if self.failed_diags.count(p.name):
                ret += "unfailing %s\n" % p.name
                self.failed_diags.remove(p.name)
            else:
                ret += "%s is not currently failing\n" % p.name
        return ret
    unfail_partitions = exposed(unfail_partitions)
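    # Illustration (not part of the original component): _find_covering above walks
    # the power set of a partition's children by treating the loop index as a bit
    # mask.  With three children sorted largest-first, i == 5 (binary 101) selects
    # kids[0] and kids[2]; a candidate cover is accepted only when the union of its
    # node cards equals the parent's node cards exactly.  A standalone, runnable
    # version of the same enumeration appears in the __main__ block at the bottom
    # of this file.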
    def _find_job_location(self, args, drain_partitions=set(), backfilling=False):
        jobid = args['jobid']
        nodes = args['nodes']
        queue = args['queue']
        utility_score = args['utility_score']
        walltime = args['walltime']
        walltime_p = args.get('walltime_p', walltime)  #*AdjEst*
        forbidden = args.get("forbidden", [])
        required = args.get("required", [])

        if walltime_prediction_enabled:  # *Adj_Est*
            runtime_estimate = float(walltime_p)
        else:
            runtime_estimate = float(walltime)

        best_score = sys.maxint
        best_partition = None

        available_partitions = set()

        requested_location = None
        if args['attrs'].has_key("location"):
            requested_location = args['attrs']['location']

        if required:
            # whittle down the list of required partitions to the ones of the proper size
            # this is a lot like the stuff in _build_locations_cache, but unfortunately,
            # reservation queues aren't assigned like real queues, so that code doesn't find
            # these
            for p_name in required:
                available_partitions.add(self.cached_partitions[p_name])
                available_partitions.update(self.cached_partitions[p_name]._children)

            possible = set()
            for p in available_partitions:
                possible.add(p.size)

            desired_size = 0
            job_nodes = int(nodes)
            for psize in sorted(possible):
                if psize >= job_nodes:
                    desired_size = psize
                    break

            for p in available_partitions.copy():
                if p.size != desired_size:
                    available_partitions.remove(p)
                elif p.name in self._not_functional_set:
                    available_partitions.remove(p)
                elif requested_location and p.name != requested_location:
                    available_partitions.remove(p)
        else:
            for p in self.possible_locations(nodes, queue):
                skip = False
                for bad_name in forbidden:
                    if p.name == bad_name or bad_name in p.children or bad_name in p.parents:
                        skip = True
                        break
                if not skip:
                    if (not requested_location) or (p.name == requested_location):
                        available_partitions.add(p)

        available_partitions -= drain_partitions
        now = time.time()

        for partition in available_partitions:
            # if the job needs more time than the partition currently has available, look elsewhere
            if backfilling:
                if partition.reserved_by:
                    #if the partition is reserved, we don't use predicted walltime to backfill
                    runtime_estimate = float(walltime)

                if 60 * runtime_estimate > (partition.backfill_time - now):  # *Adj_Est*
                    continue

                if 60 * float(walltime) > (partition.backfill_time - now):
                    continue

            if partition.state == "idle":
                # let's check the impact on partitions that would become blocked
                score = 0
                for p in partition.parents:
                    if self.cached_partitions[p].state == "idle" and self.cached_partitions[p].scheduled:
                        score += 1

                # the lower the score, the fewer new partitions will be blocked by this selection
                if score < best_score:
                    best_score = score
                    best_partition = partition

        if best_partition:
            return {jobid: [best_partition.name]}

    def _find_drain_partition(self, job):
        # if the user requested a particular partition, we only try to drain that one
        if job['attrs'].has_key("location"):
            target_name = job['attrs']['location']
            return self.cached_partitions.get(target_name, None)

        drain_partition = None
        locations = self.possible_locations(job['nodes'], job['queue'])

        for p in locations:
            if not drain_partition:
                drain_partition = p
            else:
                if p.backfill_time < drain_partition.backfill_time:
                    drain_partition = p

        if drain_partition:
            # don't try to drain for an entire weekend
            hours = (drain_partition.backfill_time - time.time()) / 3600.0
            if hours > max_drain_hours:
                drain_partition = None

        return drain_partition

    def possible_locations(self, job_nodes, q_name):
        desired_size = 0
        job_nodes = int(job_nodes)
        if self._defined_sizes.has_key(q_name):
            for psize in self._defined_sizes[q_name]:
                if psize >= job_nodes:
                    desired_size = psize
                    break

        if self._locations_cache.has_key(q_name):
            return self._locations_cache[q_name].get(desired_size, [])
        else:
            return []
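    # Illustration (not part of the original component): possible_locations picks the
    # smallest defined size that still fits the request.  If a hypothetical queue
    # 'prod' defines sizes [512, 1024, 2048], a 600-node job maps to desired_size
    # 1024 and the method returns self._locations_cache['prod'].get(1024, []).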
    # this function builds three things, namely a pair of dictionaries keyed by queue names, and a set of
    # partition names which are not functional
    #
    # self._defined_sizes maps queue names to an ordered list of partition sizes available in that queue
    #     for all schedulable partitions (even if currently offline and not functional)
    # self._locations_cache maps queue names to dictionaries which map partition sizes to partition objects;
    #     this structure will only contain partitions which are fully online, so we don't try to drain a
    #     broken partition
    # self._not_functional_set contains names of partitions which are not functional (either themselves, or
    #     a parent or child)
    def _build_locations_cache(self):
        per_queue = {}
        defined_sizes = {}
        not_functional_set = set()
        for target_partition in self.cached_partitions.itervalues():
            usable = True
            if target_partition.name in self.offline_partitions:
                usable = False
            else:
                for part in self.cached_partitions.itervalues():
                    if not part.functional:
                        not_functional_set.add(part.name)
                        if target_partition.name in part.children or target_partition.name in part.parents:
                            usable = False
                            not_functional_set.add(target_partition.name)
                            break

            for queue_name in target_partition.queue.split(":"):
                if not per_queue.has_key(queue_name):
                    per_queue[queue_name] = {}
                if not defined_sizes.has_key(queue_name):
                    defined_sizes[queue_name] = set()
                if target_partition.scheduled:
                    defined_sizes[queue_name].add(target_partition.size)
                if target_partition.scheduled and target_partition.functional and usable:
                    if not per_queue[queue_name].has_key(target_partition.size):
                        per_queue[queue_name][target_partition.size] = []
                    per_queue[queue_name][target_partition.size].append(target_partition)

        for q_name in defined_sizes:
            defined_sizes[q_name] = sorted(defined_sizes[q_name])

        self._defined_sizes = defined_sizes
        self._locations_cache = per_queue
        self._not_functional_set = not_functional_set
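    # Illustration (not part of the original component): for a hypothetical queue
    # 'prod' with two usable 512-node partitions and one usable 1024-node partition,
    # _build_locations_cache leaves behind roughly
    #
    #     self._defined_sizes      == {'prod': [512, 1024]}
    #     self._locations_cache    == {'prod': {512: [<P1>, <P2>], 1024: [<P3>]}}
    #     self._not_functional_set == names of broken partitions and of anything
    #                                 whose parent or child is broken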
    def find_job_location(self, arg_list, end_times):
        best_partition_dict = {}

        if self.bridge_in_error:
            return {}

        self._partitions_lock.acquire()
        try:
            self.cached_partitions = copy.deepcopy(self.partitions)
        except:
            self.logger.error("error in copy.deepcopy", exc_info=True)
            return {}
        finally:
            self._partitions_lock.release()

        # build the cached_partitions structure first
        self._build_locations_cache()

        # first, figure out backfilling cutoffs per partition (which we'll also use for picking which partition to drain)
        job_end_times = {}
        for item in end_times:
            job_end_times[item[0][0]] = item[1]

        now = time.time()
        for p in self.cached_partitions.itervalues():
            if p.state == "idle":
                p.backfill_time = now
            else:
                p.backfill_time = now + 5 * 60
            p.draining = False

        for p in self.cached_partitions.itervalues():
            if p.name in job_end_times:
                if job_end_times[p.name] > p.backfill_time:
                    p.backfill_time = job_end_times[p.name]

                for parent_name in p.parents:
                    parent_partition = self.cached_partitions[parent_name]
                    if p.backfill_time > parent_partition.backfill_time:
                        parent_partition.backfill_time = p.backfill_time

        for p in self.cached_partitions.itervalues():
            if p.backfill_time == now:
                continue

            for child_name in p.children:
                child_partition = self.cached_partitions[child_name]
                if child_partition.backfill_time == now or child_partition.backfill_time > p.backfill_time:
                    child_partition.backfill_time = p.backfill_time

        # first time through, try for starting jobs based on utility scores
        drain_partitions = set()

        for job in arg_list:
            partition_name = self._find_job_location(job, drain_partitions)
            if partition_name:
                best_partition_dict.update(partition_name)
                break

            location = self._find_drain_partition(job)
            if location is not None:
                for p_name in location.parents:
                    drain_partitions.add(self.cached_partitions[p_name])
                for p_name in location.children:
                    drain_partitions.add(self.cached_partitions[p_name])
                    self.cached_partitions[p_name].draining = True
                drain_partitions.add(location)
                #self.logger.info("job %s is draining %s" % (winning_job['jobid'], location.name))
                location.draining = True

        # the next time through, try to backfill, but only if we couldn't find anything to start
        if not best_partition_dict:
            # arg_list.sort(self._walltimecmp)
            for args in arg_list:
                partition_name = self._find_job_location(args, backfilling=True)
                if partition_name:
                    self.logger.info("backfilling job %s" % args['jobid'])
                    best_partition_dict.update(partition_name)
                    break

        # reserve the stuff in the best_partition_dict, as those partitions are allegedly going to
        # be running jobs very soon
        #
        # also, this is the only part of finding a job location where we need to lock anything
        self._partitions_lock.acquire()
        try:
            for p in self.partitions.itervalues():
                # push the backfilling info from the local cache back to the real objects
                p.draining = self.cached_partitions[p.name].draining
                p.backfill_time = self.cached_partitions[p.name].backfill_time

            for jobid, partition_list in best_partition_dict.iteritems():
                part = self.partitions[partition_list[0]]
                # FIXME: use reserve_resources_until() here? --brt
                part.used_by = int(jobid)
                part.reserved_until = time.time() + 5 * 60
                part.state = "allocated"
                for p in part._parents:
                    if p.state == "idle":
                        p.state = "blocked (%s)" % (part.name,)
                for p in part._children:
                    if p.state == "idle":
                        p.state = "blocked (%s)" % (part.name,)
        except:
            self.logger.error("error in find_job_location", exc_info=True)
        self._partitions_lock.release()

        return best_partition_dict
    find_job_location = locking(exposed(find_job_location))

    def _walltimecmp(self, dict1, dict2):
        return -cmp(float(dict1['walltime']), float(dict2['walltime']))

    def find_queue_equivalence_classes(self, reservation_dict, active_queue_names):
        equiv = []
        for part in self.partitions.itervalues():
            if part.functional and part.scheduled:
                part_active_queues = []
                for q in part.queue.split(":"):
                    if q in active_queue_names:
                        part_active_queues.append(q)

                # go on to the next partition if there are no running
                # queues using this partition
                if not part_active_queues:
                    continue

                found_a_match = False
                for e in equiv:
                    if e['data'].intersection(part.node_card_names):
                        e['queues'].update(part_active_queues)
                        e['data'].update(part.node_card_names)
                        found_a_match = True
                        break
                if not found_a_match:
                    equiv.append({
                        'queues': set(part_active_queues),
                        'data': set(part.node_card_names),
                        'reservations': set()
                    })

        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for e in real_equiv:
                if e['queues'].intersection(eq_class['queues']):
                    e['queues'].update(eq_class['queues'])
                    e['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv

        for eq_class in equiv:
            for res_name in reservation_dict:
                skip = True
                for p_name in reservation_dict[res_name].split(":"):
                    p = self.partitions[p_name]
                    if eq_class['data'].intersection(p.node_card_names):
                        eq_class['reservations'].add(res_name)
                    for dep_name in p._wiring_conflicts:
                        if self.partitions.has_key(dep_name):
                            if eq_class['data'].intersection(self.partitions[dep_name].node_card_names):
                                eq_class['reservations'].add(res_name)
                                break

            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']

        return equiv
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def can_run(self, target_partition, node_count, partition_dict):
        if target_partition.state != "idle":
            return False
        desired = sys.maxint
        for part in partition_dict.itervalues():
            if not part.functional:
                if target_partition.name in part.children or target_partition.name in part.parents:
                    return False
            else:
                if part.scheduled:
                    if int(node_count) <= int(part.size) < desired:
                        desired = int(part.size)
        return target_partition.scheduled and target_partition.functional and int(target_partition.size) == desired
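    # Illustration (not part of the original component): find_job_location's backfill
    # cutoffs are plain wall-clock timestamps.  If a running job on partition P is
    # expected to end 900 seconds from now, P.backfill_time == now + 900, that value
    # is pushed up to P's parents, and a backfill candidate with walltime w minutes
    # only fits when 60 * w <= P.backfill_time - now, i.e. w <= 15 in this example.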
    def reserve_resources_until(self, location, new_time, jobid):
        rc = False
        partition_name = location[0]
        pg = self.process_groups.find_by_jobid(jobid)
        try:
            self._partitions_lock.acquire()
            used_by = self.partitions[partition_name].used_by
            if used_by == None:
                self.partitions[partition_name].used_by = jobid
                used_by = jobid
            if new_time:
                if used_by == jobid:
                    self.partitions[partition_name].reserved_until = new_time
                    self.partitions[partition_name].reserved_by = jobid
                    self.logger.info("job %s: partition '%s' now reserved until %s", jobid, partition_name,
                        time.asctime(time.gmtime(new_time)))
                    rc = True
                else:
                    self.logger.error("job %s wasn't allowed to update the reservation on partition %s (owner=%s)",
                        jobid, partition_name, used_by)
            else:
                if used_by == jobid:
                    self.partitions[partition_name].reserved_until = False
                    self.partitions[partition_name].reserved_by = None
                    self.logger.info("reservation on partition '%s' has been removed", partition_name)
                    rc = True
                else:
                    self.logger.error("job %s wasn't allowed to clear the reservation on partition %s (owner=%s)",
                        jobid, partition_name, used_by)
        except:
            self.logger.exception("an unexpected error occurred while adjusting the partition reservation time")
        finally:
            self._partitions_lock.release()
        return rc
    reserve_resources_until = exposed(reserve_resources_until)
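# Illustration (not part of the original components): reserve_resources_until acts as
# a tiny ownership protocol.  A hypothetical caller that started jobid 123 on
# partition 'R00-512' would extend and later clear its reservation with
#
#     system.reserve_resources_until(['R00-512'], time.time() + 30 * 60, 123)
#     system.reserve_resources_until(['R00-512'], None, 123)
#
# and any jobid other than the recorded owner is refused (rc stays False).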
class ScriptManager(Component):
    '''The ScriptManager supports the running of scripts on a BG machine'''
    name = 'script-manager'

    # A default logger for the class is placed here.
    # Assigning an instance-level logger is supported,
    # and expected in the case of multiple instances.
    logger = logging.getLogger("Cobalt.Components.ScriptManager")
    implementation = 'scriptm'

    def __init__(self, *args, **kwargs):
        """Initialize a new ScriptManager.

        All arguments are passed to the component constructor.
        """
        Component.__init__(self, *args, **kwargs)
        self.ignore = []
        self.lastwait = 0
        self.pgroups = ProcessGroupDict()
        self.zombie_mpi = {}

    def manage_children(self):
        for pgroup in self.zombie_mpi.keys():
            if pgroup.FinishProcess():
                del self.zombie_mpi[pgroup]

        self.lock.acquire()
        try:
            if (time.time() - self.lastwait) > 6:
                while True:
                    try:
                        self.lastwait = time.time()
                        (pid, stat) = os.waitpid(-1, os.WNOHANG)
                    except OSError:
                        break
                    if pid == 0:
                        break
                    pgrps = [pgrp for pgrp in self.pgroups.itervalues() if pgrp.pid == pid]
                    if len(pgrps) == 0:
                        self.logger.error("Failed to locate process group for pid %s" % (pid))
                    elif len(pgrps) == 1:
                        pgroup = pgrps[0]
                        pgroup.exit_status = stat
                        self.logger.info("Job %s/%s: ProcessGroup %s Finished with exit code %d. pid %s" % \
                            (pgroup.jobid, pgroup.user, pgroup.jobid, int(stat)/256, pgroup.pid))
                        if os.WIFSIGNALED(stat):
                            self.logger.info("Job %s/%s: ProcessGroup %s received signal %s" % \
                                (pgroup.jobid, pgroup.user, pgroup.jobid, os.WTERMSIG(stat)))
                            try:
                                err = open(pgroup.cobalt_log_file, 'a')
                                print >> err, "The script job exited after receiving signal %s" % os.WTERMSIG(stat)
                                err.close()
                            except IOError:
                                self.logger.error("Job %s/%s: ProcessGroup %s failed to update .error file" %
                                    (pgroup.jobid, pgroup.user, pgroup.jobid))
                        self.zombie_mpi[pgroup] = True
                    else:
                        self.logger.error("Got more than one match for pid %s" % (pid))
        except:
            # just to make sure we don't keep the lock forever
            self.logger.error("error in manage_children", exc_info=True)
        self.lock.release()
    manage_children = locking(automatic(manage_children))

    def add_jobs(self, specs):
        '''Create new process group element'''
        self.logger.info("creating process group %r" % specs)
        return self.pgroups.q_add(specs)
    add_jobs = exposed(query(add_jobs))

    def get_jobs(self, specs):
        '''query existing process group'''
        return self.pgroups.q_get(specs)
    get_jobs = exposed(query(get_jobs))

    def wait_jobs(self, specs):
        '''Remove completed process group'''
        self.logger.info("removing process group %r" % specs)
        return self.pgroups.q_del(specs)
    wait_jobs = exposed(query(wait_jobs))

    def signal_jobs(self, specs, sig):
        '''signal existing process group with specified signal'''
        ret = []
        for spec in specs:
            self.logger.info("signaling process group %r with signal %r" % (spec, sig))
            for pg in self.pgroups.itervalues():
                if pg.id == int(spec['id']):
                    ret.append(pg.Signal(sig))
        # could not find pg, so return False
        return ret
    signal_jobs = exposed(signal_jobs)

    def SigChildHand(self, sig, frame):
        '''Dont Handle SIGCHLDs'''
        pass
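    # Illustration (not part of the original component): manage_children decodes the
    # raw status word from os.waitpid by hand; int(stat)/256 is the classic shift
    # that recovers the exit code when the child exited normally, and is equivalent
    # to os.WEXITSTATUS(stat) in that case.  For a child killed by a signal,
    # os.WIFSIGNALED(stat) is true and os.WTERMSIG(stat) names the signal instead.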
    def invoke_mpi_from_script(self, spec):
        '''Invoke the real mpirun on behalf of a script being executed by the script manager.'''
        self.lock.acquire()
        try:
            jobs = self.pgroups.q_get([{'jobid': spec['jobid'], 'user': spec['user']}])
        except:
            # just make sure we don't keep the lock forever
            self.logger.error("error in invoke_mpi_from_script", exc_info=True)
        self.lock.release()

        if len(jobs) != 1:
            self.logger.error("invoke_mpi_from_script did not match exactly one job with spec %r" % spec)
            return -1
        else:
            jobs[0].invoke_mpi_from_script(spec)
            return jobs[0].mpi_system_id
    invoke_mpi_from_script = locking(exposed(invoke_mpi_from_script))
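# A minimal, self-contained sketch (illustration only, not used by the components
# above): it re-implements, on toy data, two of the ideas in BGBaseSystem --
# the power-set search used by _find_covering and the "smallest size that fits"
# rule used by possible_locations.  All names and sizes here are hypothetical.
if __name__ == '__main__':
    # power-set covering: find a subset of children whose node cards exactly
    # tile the parent
    parent_cards = set(['nc0', 'nc1', 'nc2', 'nc3'])
    kids = [('left', set(['nc0', 'nc1'])), ('right', set(['nc2', 'nc3'])),
            ('overlap', set(['nc1', 'nc2']))]
    n = len(kids)
    cover = []
    for i in xrange(1, 2**n):
        test_cover = [kids[j] for j in range(n) if i & 2**j]
        test_cards = set()
        for name, cards in test_cover:
            test_cards.update(cards)
        if test_cards == parent_cards:
            cover = [name for name, cards in test_cover]
            break
    print "covering children:", cover  # -> ['left', 'right']

    # smallest defined size that fits the request
    defined_sizes = [512, 1024, 2048]
    job_nodes = 600
    desired_size = 0
    for psize in defined_sizes:
        if psize >= job_nodes:
            desired_size = psize
            break
    print "600-node job maps to partition size:", desired_size  # -> 1024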