예제 #1
0
 def validate_job(self, spec):
     """
     Validates a job for submission
     -- will the job ever run under the current Heckle configuration?

     Steps:
         1)  Validate Kernel (falls back to 'default' on any failure)
         2)  Validate HW
     A third step, validating the job against the overall system, is not
     performed by this method.

     Arguments:
         spec -- job description; spec['kernel'] is optional and
                 spec['attrs'] is expanded into keyword arguments for the
                 hardware check

     Returns: spec; spec['kernel'] is replaced by 'default' when the kernel
     is missing, rejected, or could not be checked.

     Raises: Exception when the hardware attributes are rejected or the
     hardware check itself fails.
     """
     LOGGER.debug("System:Validate Job: Specs are %s" % spec)
     hiccup = HeckleConnector()
     # Kernel validation is best effort.  Only Exception is caught, so that
     # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
     try:
         valid_kernel = hiccup.validkernel(spec['kernel'])
     except Exception as err:
         LOGGER.debug("System:Validate Job: Kernel check failed: %s" % err)
         valid_kernel = False
     if not valid_kernel:
         LOGGER.debug("System:Validate Job: Using default kernel")
         spec['kernel'] = 'default'
     try:
         valid_hw = hiccup.validhw(**spec['attrs'])
         if not valid_hw:
             raise Exception(
                 "System:Validate Job: Bad Hardware Specs: %s" % spec)
     except Exception as strec:
         # Every failure, including the rejection raised just above, is
         # re-raised with the common prefix.
         raise Exception("System:Validate Job:  Validate Job: %s" % strec)
     return spec
예제 #2
0
    def get_partitions(self, locations):
        """
        Work-around to get the cqadm to run a single job on this system

        The node name(s) found in locations[0]['name'] are checked against
        the nodes returned by HeckleConnector.list_all_nodes().

        PRE:  locations is a list of dict of strings of possible node names;
              only the 'name' entry of the first dict is examined, and it
              may be a single name or a list of names
        POST:  if good, return locations
                if not good, raise exception and list bad nodes
        """
        logstr = "System:get_partition: "
        hiccup = HeckleConnector()
        heckle_node_set = set(hiccup.list_all_nodes())
        locs = locations[0]['name']
        LOGGER.debug(logstr + "raw is are: %s" % locations)
        LOGGER.debug(logstr + "vals are: %s" % locs)
        if isinstance(locs, ListType):
            badlocations = set(locs).difference(heckle_node_set)
            if badlocations:
                raise Exception(
                    logstr + "Bad Locations: %s " % list(badlocations))
        elif isinstance(locs, StringType):
            # A single name is validated against the known nodes.  It used
            # to be looked up in `locations` itself, a list of dicts, where
            # a string can never be found, so every single name was refused.
            if locs not in heckle_node_set:
                raise Exception(logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception(
                logstr +
                "location needs to be string or list of strings, you provided %s : %s"
                % (type(locs), locs))
        return locations
예제 #3
0
 def validate_job(self, spec):
     """
     Validates a job for submission
     -- will the job ever run under the current Heckle configuration?

     Steps:
         1)  Validate Kernel (falls back to 'default' on any failure)
         2)  Validate HW
     A third step, validating the job against the overall system, is not
     performed by this method.

     Arguments:
         spec -- job description; spec['kernel'] is optional and
                 spec['attrs'] is expanded into keyword arguments for the
                 hardware check

     Returns: spec; spec['kernel'] is replaced by 'default' when the kernel
     is missing, rejected, or could not be checked.

     Raises: Exception when the hardware attributes are rejected or the
     hardware check itself fails.
     """
     LOGGER.debug("System:Validate Job: Specs are %s" % spec)
     hiccup = HeckleConnector()
     # Kernel validation is best effort.  Only Exception is caught, so that
     # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
     try:
         valid_kernel = hiccup.validkernel(spec['kernel'])
     except Exception as err:
         LOGGER.debug("System:Validate Job: Kernel check failed: %s" % err)
         valid_kernel = False
     if not valid_kernel:
         LOGGER.debug("System:Validate Job: Using default kernel")
         spec['kernel'] = 'default'
     try:
         valid_hw = hiccup.validhw(**spec['attrs'])
         if not valid_hw:
             raise Exception(
                 "System:Validate Job: Bad Hardware Specs: %s" % spec)
     except Exception as strec:
         # Every failure, including the rejection raised just above, is
         # re-raised with the common prefix.
         raise Exception("System:Validate Job:  Validate Job: %s" % strec)
     return spec
예제 #4
0
    def get_partitions(self, locations):
        """
        Work-around to get the cqadm to run a single job on this system

        The node name(s) found in locations[0]['name'] are checked against
        the nodes returned by HeckleConnector.list_all_nodes().

        PRE:  locations is a list of dict of strings of possible node names;
              only the 'name' entry of the first dict is examined, and it
              may be a single name or a list of names
        POST:  if good, return locations
                if not good, raise exception and list bad nodes
        """
        logstr = "System:get_partition: "
        hiccup = HeckleConnector()
        heckle_node_set = set(hiccup.list_all_nodes())
        locs = locations[0]['name']
        LOGGER.debug(logstr + "raw is are: %s" % locations)
        LOGGER.debug(logstr + "vals are: %s" % locs)
        if isinstance(locs, ListType):
            badlocations = set(locs).difference(heckle_node_set)
            if badlocations:
                raise Exception(
                    logstr + "Bad Locations: %s " % list(badlocations))
        elif isinstance(locs, StringType):
            # A single name is validated against the known nodes.  It used
            # to be looked up in `locations` itself, a list of dicts, where
            # a string can never be found, so every single name was refused.
            if locs not in heckle_node_set:
                raise Exception(logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception(
                logstr +
                "location needs to be string or list of strings, you provided %s : %s"
                % (type(locs), locs))
        return locations
예제 #5
0
 def find_job_location(self, job_location_args, end_times):
     """
     Finds a group of not-busy nodes in which to run the job

     Jobs are tried in ascending order of their utility_score;
     job_location_args is sorted in place.  A job whose 'attrs' entry is
     missing or None gets an empty dict stored under that key.  Every node
     handed to a job is appended to self.hacky_forbidden_nodes.

     Arguments:
         job_location_args -- A list of dictionaries with info about the job
             jobid -- string identifier
             nodes -- int number of nodes
             queue -- string queue name
             required -- ??
             utility_score -- ??
             threshold -- ??
             walltime -- ??
             attrs -- dictionary of attributes to match against
         end_times -- supposed time the job will end (not used here)

     Returns: Dictionary with list of nodes a job can run on, keyed by jobid.
     Jobs for which the lookup fails or finds nothing are left out.
     """
     LOGGER.debug("System:find_job_location")
     locations = {}
     job_location_args.sort(key=lambda job: job["utility_score"])

     #Try to match jobs to nodes which can run them
     hiccup = HeckleConnector()
     for job in job_location_args:
         if "attrs" not in job or job["attrs"] is None:
             job["attrs"] = {}
         LOGGER.debug("System:find_job_location: Job is %s" % job)
         # Query with a copy that carries the forbidden nodes.  The copy
         # used to be built and then ignored in favour of `job`, and its
         # 'forbidden' list was shared with (and extended inside) the
         # caller's dict; a fresh list avoids both problems.
         # NOTE(review): assumes HeckleConnector.find_job_location accepts a
         # 'forbidden' keyword -- it already received one whenever the job
         # itself carried that key.
         tempjob = job.copy()
         if self.hacky_forbidden_nodes:
             tempjob['forbidden'] = (list(tempjob.get('forbidden') or []) +
                                     list(self.hacky_forbidden_nodes))
         try:
             resources = hiccup.find_job_location(**tempjob)
         except Exception as err:
             LOGGER.info("System:find_job_location: Error %s" % err)
             continue
         if not resources:
             continue
         # Remember the nodes so later jobs do not get them as well
         node_list = list(resources)
         self.hacky_forbidden_nodes.extend(node_list)
         locations[job["jobid"]] = node_list
     LOGGER.info("System:find_job_location: locations are %s" % locations)
     return locations
예제 #6
0
 def _release_resources(self, pgp):
     """
     Releases all the Heckle nodes, unreserving them

     Arguments:
         pgp -- process group; pgp.uid and pgp.location are handed to
                HeckleConnector.free_reserved_node

     The released nodes are also taken out of self.hacky_forbidden_nodes.
     """
     LOGGER.debug("System:release")
     LOGGER.debug("System:Locations are: %s" % pgp.location)
     hiccup = HeckleConnector()
     hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
     # hacky_forbidden_nodes is appended to as a list elsewhere in this
     # file, so entries have to be removed by value.  The former
     # `del self.hacky_forbidden_nodes[pgp.location]` subscripted it with
     # pgp.location itself, which raised TypeError on every call; the bare
     # except hid that and released nodes stayed forbidden.
     released = pgp.location or []
     self.hacky_forbidden_nodes[:] = [
         node for node in self.hacky_forbidden_nodes
         if node not in released]
예제 #7
0
 def _release_resources(self, pgp):
     """
     Releases all the Heckle nodes, unreserving them

     Arguments:
         pgp -- process group; pgp.uid and pgp.location are handed to
                HeckleConnector.free_reserved_node

     The released nodes are also taken out of self.hacky_forbidden_nodes.
     """
     LOGGER.debug("System:release")
     LOGGER.debug("System:Locations are: %s" % pgp.location)
     hiccup = HeckleConnector()
     hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
     # hacky_forbidden_nodes is appended to as a list elsewhere in this
     # file, so entries have to be removed by value.  The former
     # `del self.hacky_forbidden_nodes[pgp.location]` subscripted it with
     # pgp.location itself, which raised TypeError on every call; the bare
     # except hid that and released nodes stayed forbidden.
     released = pgp.location or []
     self.hacky_forbidden_nodes[:] = [
         node for node in self.hacky_forbidden_nodes
         if node not in released]
예제 #8
0
 def _check_builds_done(self):
     """
     Check to see if the nodes are done building
     Starts the process group if all nodes in them are done building

     Every process group that still has pinging nodes is polled, one node
     at a time, through HeckleConnector.get_node_bootstate:
         READY       -- the node is dropped only when the process group has
                        a true 'fakebuild' instance attribute
         COMPLETED   -- the node is dropped
         BOOTING, "" -- the node stays in the list
         UNALLOCATED, CRITFAIL -- an Exception is raised
     Any other state is ignored.

     Returns: False when at least one polled process group still has
     pinging nodes afterwards, True otherwise.

     Raises: Exception for a node reporting UNALLOCATED or CRITFAIL.
     """
     exstr = "System:check_build_done:"
     retval = True
     pg_list = [group for group in self.process_groups.itervalues()
                if len(group.pinging_nodes) > 0]
     hiccup = HeckleConnector()
     for pgp in pg_list:
         # Walk a snapshot: nodes are removed from pgp.pinging_nodes inside
         # the loop, and removing from the list being iterated makes the
         # loop skip the element that follows each removal.
         for nodename in list(pgp.pinging_nodes):
             teststr = hiccup.get_node_bootstate(nodename)
             if teststr == "READY":
                 if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                     pgp.pinging_nodes.remove(nodename)
                     # One format string for both values: '%' binds tighter
                     # than '+', so the former split literal applied two
                     # arguments to a single '%s' and raised TypeError.
                     # The node is already gone, so len() is the remainder.
                     LOGGER.debug(
                         exstr +
                         "Node %s done building; %s pinging nodes left" %
                         (nodename, len(pgp.pinging_nodes)))
                 else:
                     LOGGER.debug(exstr + "Node %s not done yet" % nodename)
             elif teststr == "COMPLETED":
                 LOGGER.debug(
                     exstr + "Removing node %s...%i pinging nodes left" %
                     (nodename, len(pgp.pinging_nodes) - 1))
                 pgp.pinging_nodes.remove(nodename)
             elif teststr in ("BOOTING", ""):
                 LOGGER.debug(exstr + "Node %s not done yet." % nodename)
             elif teststr == "UNALLOCATED":
                 raise Exception(
                     exstr +
                     "Node 'UNALLOCATED'; Possible build error, or system timed out."
                 )
             elif teststr == "CRITFAIL":
                 raise Exception(
                     exstr +
                     "Node says, 'CRITFAIL'.  It timed out while building.")
             #####################
             ####     Need to figure a better way to fail gracefully
             #####################
         if len(pgp.pinging_nodes) == 0:
             LOGGER.debug(
                 "System:Check Build Done: No Pinging Nodes left, Start PG %s Running."
                 % pgp.jobid)
             pgp.start()
         else:
             retval = False
     return retval
예제 #9
0
 def get_resources(self, specs=None):
     """
     Look up nodes through a HeckleConnector.

     Arguments:
         specs -- optional dict which describes a job; it is expanded into
                  keyword arguments for list_available_nodes

     Returns: the connector's node_list attribute when specs is empty or
     None, otherwise the result of list_available_nodes(**specs).
     """
     LOGGER.debug("System:get Resources")
     ##################################
     ###  Look at this as a future change
     ##################################
     connector = HeckleConnector()
     if specs:
         return connector.list_available_nodes(**specs)
     return connector.node_list
예제 #10
0
 def get_resources(self, specs=None):
     """
     Fetch nodes from Heckle, optionally restricted by a job description.

     Arguments:
         specs -- optional dict which describes a job; passed on as keyword
                  arguments

     Returns: HeckleConnector().node_list when no specs are given (None or
     empty), else whatever list_available_nodes(**specs) returns.
     """
     LOGGER.debug("System:get Resources")
     ##################################
     ###  Look at this as a future change
     ##################################
     heckle = HeckleConnector()
     # Guard clause for the unfiltered case
     if not specs:
         return heckle.node_list
     matching_nodes = heckle.list_available_nodes(**specs)
     return matching_nodes
예제 #11
0
 def _check_builds_done(self):
     """
     Check to see if the nodes are done building
     Starts the process group if all nodes in them are done building

     Every process group that still has pinging nodes is polled, one node
     at a time, through HeckleConnector.get_node_bootstate:
         READY       -- the node is dropped only when the process group has
                        a true 'fakebuild' instance attribute
         COMPLETED   -- the node is dropped
         BOOTING, "" -- the node stays in the list
         UNALLOCATED, CRITFAIL -- an Exception is raised
     Any other state is ignored.

     Returns: False when at least one polled process group still has
     pinging nodes afterwards, True otherwise.

     Raises: Exception for a node reporting UNALLOCATED or CRITFAIL.
     """
     exstr = "System:check_build_done:"
     retval = True
     pg_list = [group for group in self.process_groups.itervalues()
                if len(group.pinging_nodes) > 0]
     hiccup = HeckleConnector()
     for pgp in pg_list:
         # Walk a snapshot: nodes are removed from pgp.pinging_nodes inside
         # the loop, and removing from the list being iterated makes the
         # loop skip the element that follows each removal.
         for nodename in list(pgp.pinging_nodes):
             teststr = hiccup.get_node_bootstate(nodename)
             if teststr == "READY":
                 if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                     pgp.pinging_nodes.remove(nodename)
                     # One format string for both values: '%' binds tighter
                     # than '+', so the former split literal applied two
                     # arguments to a single '%s' and raised TypeError.
                     # The node is already gone, so len() is the remainder.
                     LOGGER.debug(
                         exstr +
                         "Node %s done building; %s pinging nodes left" %
                         (nodename, len(pgp.pinging_nodes)))
                 else:
                     LOGGER.debug(exstr + "Node %s not done yet" % nodename)
             elif teststr == "COMPLETED":
                 LOGGER.debug(
                     exstr + "Removing node %s...%i pinging nodes left" %
                     (nodename, len(pgp.pinging_nodes) - 1))
                 pgp.pinging_nodes.remove(nodename)
             elif teststr in ("BOOTING", ""):
                 LOGGER.debug(exstr + "Node %s not done yet." % nodename)
             elif teststr == "UNALLOCATED":
                 raise Exception(
                     exstr +
                     "Node 'UNALLOCATED'; Possible build error, or system timed out."
                 )
             elif teststr == "CRITFAIL":
                 raise Exception(
                     exstr +
                     "Node says, 'CRITFAIL'.  It timed out while building.")
             #####################
             ####     Need to figure a better way to fail gracefully
             #####################
         if len(pgp.pinging_nodes) == 0:
             LOGGER.debug(
                 "System:Check Build Done: No Pinging Nodes left, Start PG %s Running."
                 % pgp.jobid)
             pgp.start()
         else:
             retval = False
     return retval
예제 #12
0
 def verify_locations(self, location_list):
     """
     Confirms that every name in a location list is a node Heckle reports.

     Arguments:
         location_list -- list of fully qualified strings of node names
                          ex:  nodename.mcs.anl.gov

     Returns: location_list itself when all names are known.

     Raises: Exception listing the names that are missing from
     HeckleConnector.list_all_nodes().
     """
     LOGGER.debug("System:validate Job: Verify Locations")
     connector = HeckleConnector()
     known_nodes = set(connector.list_all_nodes())
     requested = set(location_list)
     if not requested.issubset(known_nodes):
         unknown = list(requested.difference(known_nodes))
         raise Exception(
             "System:VerifyLocations: Invalid location names: %s" % unknown)
     return location_list
예제 #13
0
 def __init__(self, spec):
     logstr = "ProcessGroup:__INIT__:"
     LOGGER.debug(logstr + "Spec is: %s " % spec)
     ProcessGroup.__init__(self, spec, LOGGER)
     hiccup = HeckleConnector()
     self.location = spec["location"][:]
     self.pinging_nodes = spec["location"][:]
     print "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n Location is: %s, %s, %s\n&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" % (
         self.location,
         self.pinging_nodes,
         spec["location"],
     )
     # Set up process group attributes
     if not spec["kernel"]:
         spec["kernel"] = "default"
     self.kernel = spec["kernel"]
     self.user = self.uid = spec["user"]
     self.resource_attributes = {}
     for loc in self.location:
         self.resource_attributes[loc] = hiccup.get_node_properties(loc)
     print "The environment variables at this point are: %s" % spec["env"]
     try:
         temp_env = spec["env"]["data"]
         del (spec["env"]["data"])
         spec["env"].update(temp_env)
     except:
         pass
     try:  #  Checking for Fakebuild
         spec["fakebuild"] = spec["env"]["fakebuild"]
         del spec["env"]["fakebuild"]
     except:
         spec["fakebuild"] = False
     self.env = spec["env"]
     # Write nodefile
     self.nodefile = tempfile.mkstemp()
     print "\n\n\n\n\nNodefile is: %s\n\n\n\n\n" % self.nodefile[1]
     os.write(self.nodefile[0], " ".join(self.location))
     os.chmod(self.nodefile[1], stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
     os.close(self.nodefile[0])
     # Make heckle reservation
     res_attrs = ["location", "kernel", "walltime", "user", "fakebuild", "comment"]
     res_args = {}
     for attr in spec:
         res_args[attr] = spec[attr]
     reservation = hiccup.make_reservation(res_args)
     self.heckle_res_id = reservation.id
예제 #14
0
 def verify_locations(self, location_list):
     """
     Checks a location list against the nodes known to Heckle.

     Arguments:
         location_list -- list of fully qualified strings of node names
                          ex:  nodename.mcs.anl.gov

     Returns: the same location_list when it holds no unknown name.

     Raises: Exception naming every entry that
     HeckleConnector.list_all_nodes() does not contain.
     """
     LOGGER.debug("System:validate Job: Verify Locations")
     heckle = HeckleConnector()
     all_nodes = set(heckle.list_all_nodes())
     wanted = set(location_list)
     # Guard clause: the common, valid case returns straight away
     if wanted <= all_nodes:
         return location_list
     invalid_names = list(wanted.difference(all_nodes))
     message = "System:VerifyLocations: Invalid location names: %s"
     raise Exception(message % invalid_names)
예제 #15
0
 def __init__(self, spec):
     logstr = "ProcessGroup:__INIT__:"
     LOGGER.debug(logstr + "Spec is: %s " % spec)
     ProcessGroup.__init__(self, spec, LOGGER)
     hiccup = HeckleConnector()
     self.location = spec['location'][:]
     self.pinging_nodes = spec['location'][:]
     print "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n Location is: %s, %s, %s\n&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" % (
         self.location, self.pinging_nodes, spec['location'])
     # Set up process group attributes
     if not spec['kernel']:
         spec['kernel'] = "default"
     self.kernel = spec['kernel']
     self.user = self.uid = spec['user']
     self.resource_attributes = {}
     for loc in self.location:
         self.resource_attributes[loc] = hiccup.get_node_properties(loc)
     print "The environment variables at this point are: %s" % spec['env']
     try:
         temp_env = spec['env']['data']
         del (spec['env']['data'])
         spec['env'].update(temp_env)
     except:
         pass
     try:  #  Checking for Fakebuild
         spec['fakebuild'] = spec['env']['fakebuild']
         del spec['env']['fakebuild']
     except:
         spec['fakebuild'] = False
     self.env = spec['env']
     # Write nodefile
     self.nodefile = tempfile.mkstemp()
     print "\n\n\n\n\nNodefile is: %s\n\n\n\n\n" % self.nodefile[1]
     os.write(self.nodefile[0], " ".join(self.location))
     os.chmod(self.nodefile[1], stat.S_IRUSR|stat.S_IWUSR|stat.S_IRGRP| \
     stat.S_IROTH)
     os.close(self.nodefile[0])
     # Make heckle reservation
     res_attrs = ['location', 'kernel', 'walltime', 'user', 'fakebuild'\
         , 'comment']
     res_args = {}
     for attr in spec:
         res_args[attr] = spec[attr]
     reservation = hiccup.make_reservation(res_args)
     self.heckle_res_id = reservation.id
예제 #16
0
    def find_job_location(self, job_location_args, end_times):
        """
        Finds a group of not-busy nodes in which to run the job

        Jobs are tried in ascending order of their utility_score;
        job_location_args is sorted in place.  A job whose 'attrs' entry is
        missing or None gets an empty dict stored under that key.  Every
        node handed to a job is appended to self.hacky_forbidden_nodes.

        Arguments:
            job_location_args -- A list of dictionaries with info about the job
                jobid -- string identifier
                nodes -- int number of nodes
                queue -- string queue name
                required -- ??
                utility_score -- ??
                threshold -- ??
                walltime -- ??
                attrs -- dictionary of attributes to match against
            end_times -- supposed time the job will end (not used here)

        Returns: Dictionary with list of nodes a job can run on, keyed by
        jobid.  Jobs for which the lookup fails or finds nothing are left
        out.
        """
        LOGGER.debug("System:find_job_location")
        locations = {}
        job_location_args.sort(key=lambda job: job["utility_score"])

        #Try to match jobs to nodes which can run them
        hiccup = HeckleConnector()
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            LOGGER.debug("System:find_job_location: Job is %s" % job)
            # Query with a copy that carries the forbidden nodes.  The copy
            # used to be built and then ignored in favour of `job`, and its
            # 'forbidden' list was shared with (and extended inside) the
            # caller's dict; a fresh list avoids both problems.
            # NOTE(review): assumes HeckleConnector.find_job_location
            # accepts a 'forbidden' keyword -- it already received one
            # whenever the job itself carried that key.
            tempjob = job.copy()
            if self.hacky_forbidden_nodes:
                tempjob['forbidden'] = (
                    list(tempjob.get('forbidden') or []) +
                    list(self.hacky_forbidden_nodes))
            try:
                resources = hiccup.find_job_location(**tempjob)
            except Exception as err:
                LOGGER.info("System:find_job_location: Error %s" % err)
                continue
            if not resources:
                continue
            # Remember the nodes so later jobs do not get them as well
            node_list = list(resources)
            self.hacky_forbidden_nodes.extend(node_list)
            locations[job["jobid"]] = node_list
        LOGGER.info("System:find_job_location: locations are %s" % locations)
        return locations