def validate_job(self, spec):
    """
    Validates a job for submission -- will the job ever run
    under the current Heckle configuration?
    Steps:
        1) Validate Kernel
        2) Validate HW
        3) Validate Job versus overall
    """
    LOGGER.debug("System:Validate Job: Specs are %s" % spec)
    hiccup = HeckleConnector()
    try:
        kernel = spec['kernel']
        valid_kernel = hiccup.validkernel(kernel)
        if not valid_kernel:
            raise Exception("System:Validate Job: Bad Kernel")
    except Exception:
        # No kernel given, or kernel not known to Heckle: fall back to default
        spec['kernel'] = 'default'
    try:
        valid_hw = hiccup.validhw(**spec['attrs'])
        if not valid_hw:
            raise Exception(
                "System:Validate Job: Bad Hardware Specs: %s" % spec)
    except Exception as strec:
        raise Exception("System:Validate Job: %s" % strec)
    #try:
        #valid_job = hiccup.validjob( **spec )
        #if not valid_job:
            #raise Exception(
                #"System: validate Job: Never enough nodes")
    #except:
        #raise Exception("System: validate Job: Never enough nodes")
    return spec
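# A hypothetical usage sketch for validate_job; the attrs keys and values
# below are made up for illustration, and `system` stands in for an instance
# of the enclosing System component:
#
#   spec = {'kernel': 'mykernel', 'attrs': {'cpu': 'opteron'}}
#   spec = system.validate_job(spec)
#   # spec['kernel'] is reset to 'default' if the kernel fails validation;
#   # invalid hardware attrs raise an Exception instead.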
def get_partitions(self, locations):
    """
    Work-around to get cqadm to run a single job on this system
    PRE:  locations is a list of dicts of possible node names
    POST: if good, return locations
          if not good, raise an exception listing the bad nodes
    """
    logstr = "System:get_partition: "
    hiccup = HeckleConnector()
    heckle_node_set = set(hiccup.list_all_nodes())
    locs = locations[0]['name']
    LOGGER.debug(logstr + "raw locations are: %s" % locations)
    LOGGER.debug(logstr + "vals are: %s" % locs)
    if type(locs) == ListType:
        locset = set(locs)
        badlocations = locset.difference(heckle_node_set)
        if badlocations:
            raise Exception(
                logstr + "Bad Locations: %s " % list(badlocations))
    elif type(locs) == StringType:
        # A single node name must also be one Heckle knows about
        if locs not in heckle_node_set:
            raise Exception(logstr + "Bad Locations: %s" % locs)
    else:
        raise Exception(
            logstr + "location needs to be a string or list of strings, "
            "you provided %s : %s" % (type(locs), locs))
    return locations
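# Hypothetical shape of the locations argument get_partitions expects; the
# node names are illustrative only:
#
#   locations = [{'name': ['node01.mcs.anl.gov', 'node02.mcs.anl.gov']}]
#   locations = system.get_partitions(locations)   # raises on unknown nodes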
def find_job_location(self, job_location_args, end_times):
    """
    Finds a group of not-busy nodes in which to run the job
    Arguments:
        job_location_args -- A list of dictionaries with info about the job
            jobid -- string identifier
            nodes -- int number of nodes
            queue -- string queue name
            required -- ??
            utility_score -- ??
            threshold -- ??
            walltime -- ??
            attrs -- dictionary of attributes to match against
        end_times -- supposed time the job will end
    Returns:
        Dictionary with list of nodes a job can run on, keyed by jobid
    """
    LOGGER.debug("System:find_job_location")
    locations = {}

    def jobsort(job):
        """Used to sort job list by utility score"""
        return job["utility_score"]

    job_location_args.sort(key=jobsort)
    # Try to match jobs to nodes which can run them
    hiccup = HeckleConnector()
    for job in job_location_args:
        if "attrs" not in job or job["attrs"] is None:
            job["attrs"] = {}
        print "Job is %s" % job
        tempjob = job.copy()
        if self.hacky_forbidden_nodes:
            if 'forbidden' not in tempjob:
                tempjob['forbidden'] = self.hacky_forbidden_nodes
            else:
                tempjob['forbidden'].extend(self.hacky_forbidden_nodes)
        #############################
        ###  Look at this as point of change
        ###    Think:  For node in unreserved nodes
        ###            Choose node from list
        ###            Remove node from unreserved nodes
        #############################
        try:
            # Get matching nodes, excluding any already-claimed (forbidden) ones
            resources = hiccup.find_job_location(**tempjob)
            if not resources:
                continue
        except Exception as err:
            LOGGER.info("System:find_job_location: Error %s" % err)
            continue
        node_list = []
        # Build a list of appropriate nodes
        for node in resources:
            node_list.append(node)
            self.hacky_forbidden_nodes.append(node)
        locations[job["jobid"]] = node_list
    LOGGER.info("System:find_job_location: locations are %s" % locations)
    return locations
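# Hypothetical usage sketch for find_job_location; the field values are
# illustrative only:
#
#   job_args = [{'jobid': '1001', 'nodes': 2, 'queue': 'default',
#                'utility_score': 0.5, 'walltime': '30', 'attrs': {},
#                'required': [], 'threshold': 1}]
#   locations = system.find_job_location(job_args, end_times=[])
#   # e.g. locations == {'1001': ['node01.mcs.anl.gov', 'node02.mcs.anl.gov']}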
def _release_resources(self, pgp):
    """
    Releases all the Heckle nodes, unreserving them
    """
    LOGGER.debug("System:release")
    LOGGER.debug("System:Locations are: %s" % pgp.location)
    hiccup = HeckleConnector()
    hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
    # Drop the released nodes from the forbidden list so they can be
    # allocated again
    self.hacky_forbidden_nodes = [node for node in self.hacky_forbidden_nodes
                                  if node not in pgp.location]
def _check_builds_done(self):
    """
    Check to see if the nodes are done building
    Starts the process group if all nodes in them are done building
    """
    #LOGGER.debug( "System:Check Build Done: Waiting to Start..." )
    #sleep(20)
    exstr = "System:check_build_done:"
    retval = True
    pg_list = [x for x in self.process_groups.itervalues()
               if len(x.pinging_nodes) > 0]
    hiccup = HeckleConnector()
    for pgp in pg_list:
        # Iterate over a copy, since nodes may be removed from the list
        for nodename in pgp.pinging_nodes[:]:
            teststr = hiccup.get_node_bootstate(nodename)
            if teststr == "READY":
                if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                    pgp.pinging_nodes.remove(nodename)
                    LOGGER.debug(
                        exstr + "Node %s done building; %s pinging nodes left"
                        % (nodename, len(pgp.pinging_nodes)))
                else:
                    LOGGER.debug(exstr + "Node %s not done yet" % nodename)
            elif teststr == "COMPLETED":
                LOGGER.debug(
                    exstr + "Removing node %s...%i pinging nodes left"
                    % (nodename, len(pgp.pinging_nodes) - 1))
                pgp.pinging_nodes.remove(nodename)
            elif teststr in ["BOOTING", ""]:
                LOGGER.debug(exstr + "Node %s not done yet." % nodename)
            elif teststr == "UNALLOCATED":
                raise Exception(
                    exstr + "Node 'UNALLOCATED'; Possible build error, "
                    "or system timed out.")
            elif teststr == "CRITFAIL":
                raise Exception(
                    exstr + "Node says, 'CRITFAIL'. It timed out while building.")
        #####################
        ####     Need to figure a better way to fail gracefully
        #####################
        if len(pgp.pinging_nodes) == 0:
            LOGGER.debug(
                "System:Check Build Done: No Pinging Nodes left, "
                "Start PG %s Running." % pgp.jobid)
            pgp.start()
        else:
            retval = False
    return retval
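# A minimal sketch of how a caller might poll _check_builds_done until every
# pinging node has finished building; the polling loop and interval are
# assumptions, not part of this module:
#
#   while not system._check_builds_done():
#       sleep(20)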
def get_resources(self, specs=None):
    """
    Returns a list of free resources (nodes) which match the given specs.
    Specs is a dict which describes a job.
    """
    LOGGER.debug("System:get Resources")
    ##################################
    ###  Look at this as a future change
    ##################################
    hiccup = HeckleConnector()
    if not specs:
        return hiccup.node_list
    else:
        return hiccup.list_available_nodes(**specs)
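# Hypothetical calls to get_resources; the spec key used below is only an
# illustration of a job-describing attribute:
#
#   all_nodes = system.get_resources()                        # every known node
#   free_nodes = system.get_resources({'kernel': 'default'})  # filtered by spec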
def verify_locations(self, location_list):
    """
    Makes sure a location list is valid
    location list is a list of fully qualified strings of node names
        ex:  nodename.mcs.anl.gov
    """
    LOGGER.debug("System:validate Job: Verify Locations")
    hiccup = HeckleConnector()
    heckle_set = set(hiccup.list_all_nodes())
    location_set = set(location_list)
    if heckle_set >= location_set:
        return location_list
    else:
        not_valid_list = list(location_set.difference(heckle_set))
        raise Exception(
            "System:VerifyLocations: Invalid location names: %s"
            % not_valid_list)
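# Hypothetical example; verify_locations returns the list unchanged when every
# name is known to Heckle, and raises otherwise:
#
#   good = system.verify_locations(['node01.mcs.anl.gov'])
#   system.verify_locations(['bogus.mcs.anl.gov'])   # raises Exception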
def __init__(self, spec):
    logstr = "ProcessGroup:__INIT__:"
    LOGGER.debug(logstr + "Spec is: %s " % spec)
    ProcessGroup.__init__(self, spec, LOGGER)
    hiccup = HeckleConnector()
    self.location = spec["location"][:]
    self.pinging_nodes = spec["location"][:]
    print "Location is: %s, %s, %s" % (
        self.location, self.pinging_nodes, spec["location"])
    # Set up process group attributes
    if not spec["kernel"]:
        spec["kernel"] = "default"
    self.kernel = spec["kernel"]
    self.user = self.uid = spec["user"]
    self.resource_attributes = {}
    for loc in self.location:
        self.resource_attributes[loc] = hiccup.get_node_properties(loc)
    print "The environment variables at this point are: %s" % spec["env"]
    try:
        # Flatten the 'data' sub-dict into the environment dict
        temp_env = spec["env"]["data"]
        del spec["env"]["data"]
        spec["env"].update(temp_env)
    except KeyError:
        pass
    try:
        # Checking for Fakebuild
        spec["fakebuild"] = spec["env"]["fakebuild"]
        del spec["env"]["fakebuild"]
    except KeyError:
        spec["fakebuild"] = False
    self.env = spec["env"]
    # Write nodefile
    self.nodefile = tempfile.mkstemp()
    print "Nodefile is: %s" % self.nodefile[1]
    os.write(self.nodefile[0], " ".join(self.location))
    os.chmod(self.nodefile[1],
             stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
    os.close(self.nodefile[0])
    # Make heckle reservation, passing only the reservation-relevant attributes
    res_attrs = ["location", "kernel", "walltime", "user", "fakebuild",
                 "comment"]
    res_args = {}
    for attr in res_attrs:
        if attr in spec:
            res_args[attr] = spec[attr]
    reservation = hiccup.make_reservation(res_args)
    self.heckle_res_id = reservation.id
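# A hypothetical sketch of the spec dict this ProcessGroup subclass consumes;
# only keys referenced above are shown, all values are made up, and the class
# name used here is an assumption:
#
#   spec = {'location': ['node01.mcs.anl.gov'], 'kernel': 'default',
#           'user': 'someuser', 'walltime': '30', 'comment': 'example job',
#           'env': {'data': {}, 'fakebuild': False}}
#   pgroup = HeckleProcessGroup(spec)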