Esempio n. 1
0
 def __init__(self, *args, **kwargs):
     """Initialize the system component.

     Sets up the process-group container, assigns every currently free
     resource to the "default" queue, and creates the ad-hoc
     forbidden-nodes list.
     """
     Component.__init__(self, *args, **kwargs)
     # Items stored in this container are HeckleProcessGroup objects.
     self.process_groups = ProcessGroupDict()
     self.process_groups.item_cls = HeckleProcessGroup
     # All free resources start out assigned to the default queue.
     self.queue_assignments["default"] = self.get_resources()
     # HACK: nodes placed in this list are excluded from job placement
     # until explicitly released.
     self.hacky_forbidden_nodes = [
     ]  #This is a temporary fix for the forbidden nodes issue
Esempio n. 2
0
 def __init__(self, *args, **kwargs):
     """Initialize the system component.

     Sets up the process-group container, loads the resource dictionary,
     and assigns every known resource to the "default" queue.
     """
     # Use lazy %-args and a concise message instead of the old banner of
     # ampersands; debug output belongs on the logger, not raw prints.
     logger.debug("heckle: System: init in thread %s",
                  threading.current_thread().getName())
     Component.__init__(self, *args, **kwargs)
     self.process_groups = ProcessGroupDict()
     self.process_groups.item_cls = HeckleProcessGroup
     self.resources = ResourceDict()
     self.queue_assignments["default"] = self.resources.keys()
     logger.debug("Queue assignments are: %s", self.queue_assignments)
Esempio n. 3
0
 def __init__(self, *args, **kwargs):
     """Initialize the system component.

     Sets up the process-group container, loads the resource dictionary,
     and assigns every known resource to the "default" queue.
     """
     logger.debug("heckle: System: init in thread %s",
                  threading.current_thread().getName())
     Component.__init__(self, *args, **kwargs)
     self.process_groups = ProcessGroupDict()
     self.process_groups.item_cls = HeckleProcessGroup
     self.resources = ResourceDict()
     self.queue_assignments["default"] = self.resources.keys()
     # Debug output goes through the logger instead of raw prints.
     logger.debug("Queue assignments are: %s", self.queue_assignments)
Esempio n. 4
0
class HeckleSystem(Component):
    """
    Cobalt System component for handling / interacting with the Heckle
    resource manager.

    External Methods:
         add_process_groups -- allocates nodes
         get_process_groups -- get process groups based on specs
         signal_process_groups -- signal a process group
         wait_process_groups -- removes process groups based on specs

    Internal Methods:
         __init__, _start_pg, _check_builds_done, _wait,
         _release_resources, get_resources

    Queue Manager Methods:
         validate_job, verify_locations, find_job_location,
         find_queue_equivalence_classes
    """

    name = "system"
    implementation = "HeckleBreadboard"
    # NOTE: class-level mutable dict -- shared by all instances of this
    # component.
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        """Initialize the system component.

        Sets up the process-group container, loads the resource dictionary,
        and assigns every known resource to the "default" queue.
        """
        # Lazy %-args, concise message; debug output belongs on the logger,
        # not raw prints.
        logger.debug("heckle: System: init in thread %s",
                     threading.current_thread().getName())
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = HeckleProcessGroup
        self.resources = ResourceDict()
        self.queue_assignments["default"] = self.resources.keys()
        logger.debug("Queue assignments are: %s", self.queue_assignments)

    def __repr__(self):
        """Return a printable representation of the component's state:
        every instance attribute followed by the tracked process groups."""
        printstr = "Heckle System Object: Values"
        for element in self.__dict__:
            value = self.__dict__[element]
            printstr += str(element) + "::"
            # Identity check: '== None' can be fooled by a custom __eq__.
            if value is None:
                printstr += "None, "
            else:
                printstr += str(value) + ", "
        printstr += "   Process Groups:"
        for element in self.process_groups:
            printstr += str(element) + "::" + str(
                self.process_groups[element]) + ", "
        return printstr

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
        Allocate nodes and add the list of those allocated to the PGDict.

        specs is a list of dictionaries; each dictionary contains the
        specifications for all the nodes in the process group.
        Raises whatever HeckleConnector.make_reservation raises on failure.
        """
        # The optional 'fakebuild' flag rides in on the job environment;
        # pull it out so it does not leak into the job's real env.  Narrow
        # except: only absent keys / missing specs are a no-op.
        try:
            specs[0]['fakebuild'] = specs[0]['env']['fakebuild']
            del specs[0]['env']['fakebuild']
        except (KeyError, IndexError, TypeError):
            pass
        logger.debug("Heckle System: add_process_groups: specs are %s", specs)
        HICCUP = HeckleConnector()
        # NOTE: reservation failures propagate to the caller; see repo
        # history for the discussion of kill/resubmit vs. fail strategies.
        reservation = HICCUP.make_reservation(**(specs[0]))
        heckle_res_id = reservation.id
        uid = specs[0]['user']
        logger.debug("Heckle System: heckle_res_id = %i" % heckle_res_id)
        specs[0]['heckle_res_id'] = heckle_res_id
        return self.process_groups.q_add(
            specs, lambda x, _: self._start_pg(
                x, heckle_res_id=heckle_res_id, uid=uid))

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Return the list of existing allocations matching *specs*."""
        # Reap any finished groups before answering the query.
        self._wait()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Deliver signal *sig* to every process group matching *specs*."""
        logger.debug(
            "Heckle System: signal_process_groups: Specs are %s, sig is %s" %
            (specs, sig))

        def deliver(group, signame):
            """Forward the signal to a single group."""
            return group.signal(signame)

        return self.process_groups.q_get(specs, deliver, sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups, releasing their resources."""
        logger.debug("Heckle System: wait_process_groups; specs are %s" %
                     specs)

        def release(group, _):
            """Free the nodes held by a single terminated group."""
            return self._release_resources(group)

        return self.process_groups.q_del(specs, release)

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################

    def _start_pg(self, pgp, heckle_res_id, uid):
        """
        Populate the process group *pgp* with its resources:
             gets node information for nodes in the process group,
             updates those attributes, and
             places nodes in the pinging-nodes list, to see if they're built.
        """
        logger.debug("Heckle System: start_pg: PGP is %s" % pgp)
        nodelist = pgp.location
        for node in nodelist:
            node_attributes = self.resources[node]
            # Heckle reports MACs dash-separated; normalize to colon form.
            node_attributes['mac'] = node_attributes['mac'].replace("-", ":")
            node_attributes['heckle_res_id'] = heckle_res_id
            pgp.resource_attributes[node] = node_attributes._get_dict()
        pgp.uid = uid
        # BUG FIX: extend with the individual node names instead of
        # appending the whole list as one element -- _check_builds_done
        # iterates pinging_nodes expecting node-name strings.
        pgp.pinging_nodes.extend(nodelist)

    # NOTE: a stray duplicate 'add_process_groups = exposed(query(...))'
    # used to follow this method, double-wrapping the already-exposed
    # method defined earlier; it has been removed.

    def _check_builds_done(self):
        """
        Check whether the nodes are done building.

        Starts a process group once every node in it has finished building.
        Returns True when no group is still waiting on builds, else False.
        Raises Exception on UNALLOCATED / CRITFAIL / unexpected READY states.
        """
        retval = True
        # Only groups that still have nodes waiting on their build.
        pg_list = [
            x for x in self.process_groups.itervalues()
            if (len(x.pinging_nodes) > 0)
        ]
        self.resources.update()
        for pgp in pg_list:
            # BUG FIX: iterate over a copy -- removing from the list while
            # iterating it used to skip the element after each removal.
            for nodename in pgp.pinging_nodes[:]:
                teststr = self.resources[nodename]['bootstate']
                if teststr == "COMPLETED":
                    logger.debug(
                        "heckle: System: Check Build Done: Removing node %s...%i pinging nodes left"
                        % (nodename, len(pgp.pinging_nodes) - 1))
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ("BOOTING", ""):
                    # Still building; empty bootstate is treated the same.
                    logger.debug(
                        "Heckle System: Check Build Done: Node %s not done yet."
                        % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'UNALLOCATED'.  Possible build error, or system timed out."
                    )
                elif teststr == "CRITFAIL":
                    # TODO: fail more gracefully than aborting the scan.
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'CRITFAIL'.  It timed out while building."
                    )
                elif teststr == "READY":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'READY'.  The Heckle Reservation is already ready already, skipping pinging."
                    )
            if len(pgp.pinging_nodes) == 0:
                logger.debug(
                    "Heckle System: Check Build Done: No Pinging Nodes left, Start PG Running."
                )
                pgp.start()
            else:
                retval = False
        return retval

    _check_builds_done = automatic(_check_builds_done)

    def _wait(self):
        """Invoke wait() on every tracked process group."""
        for group in self.process_groups.itervalues():
            group.wait()

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Release and unreserve all Heckle nodes held by *pgp*."""
        logger.debug("Heckle System: Release %s" % pgp.location)
        connector = HeckleConnector()
        connector.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
        self.resources.free(nodes=pgp.location)

    def get_resources(self, specs=None):
        """
        Return a list of names for all the FREE resources (nodes) which
        match the given specs.

        BUG FIX: the old signature used a mutable default ({}) and then
        mutated it, so the filter keys leaked between calls and into any
        dict the caller passed in.  Work on a private copy instead.
        """
        specs = {} if specs is None else dict(specs)
        logger.debug("Heckle System: get Resources, specs are %s" % specs)
        specs['current reservation'] = 9999999
        specs['allocatable'] = 'True'
        res_list = self.resources >= specs
        logger.debug("Heckle System: get Resources, resources are %s" %
                     res_list)
        return res_list

    get_resources = exposed(query(get_resources))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################

    def validate_job(self, spec):
        """
        Validate a job for submission
        -- will the job ever run under the current Heckle configuration?

        Steps:
             1)  Check there are enough nodes in the system at all.
             2)  Check every requested attribute/value against the glossary.
             3)  Check that enough matching, allocatable nodes exist.
        Returns *spec* unchanged on success; raises Exception otherwise.
        """
        logger.debug("Heckle System: Validate Job: Specs are %s" % spec)
        try:
            checklist = spec['attrs']
        except KeyError:
            checklist = {}
        try:
            nodecount = spec['nodecount']
        except KeyError:
            nodecount = 1
        glossary = self.resources.glossary
        dnelist = []  # for attributes which do not exist in glossary
        badlist = []  # for glossary attributes with an unknown value
        # 1st step:  Are there enough nodes at all?
        # BUG FIX: the comparison was inverted (>=), which rejected every
        # job requesting fewer nodes than the system holds; the message
        # also called a nonexistent 'nodecount()' method.
        if nodecount > self.resources.node_count():
            raise Exception(
                "Validate Job: Not enough nodes; Requested %s, only have %s in the system."
                % (nodecount, self.resources.node_count()))
        for att in checklist:
            val = checklist[att]
            try:
                if val not in glossary[att]:
                    badlist.append("%s:%s" % (att, val))  # Bad attribute
            except KeyError:
                dnelist.append(att)  #Attribute does not exist
        # BUG FIX: the availability check below used to sit inside the
        # attribute loop (returning before all attributes were examined);
        # it also referenced a misspelled 'restr' variable (NameError).
        checklist['current reservation'] = 9999999
        checklist['allocatable'] = 'True'
        retlist = self.resources >= checklist
        retcount = len(retlist)
        if retcount >= nodecount and not badlist and not dnelist:
            return spec  #Good Job!
        retstr = "Validate Job: "
        if badlist or dnelist:
            if badlist:
                retstr += "No value for attribute: %s" % badlist
            if dnelist:
                retstr += "Attributes Do Not Exist: %s" % dnelist
        else:
            retstr += "Need %s nodes, only have %s nodes:  %s" % (
                nodecount, retcount, retlist)
        raise Exception(retstr)

    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """
        Make sure a location list is valid.

        location_list is a list of fully qualified strings of node names,
        e.g.  nodename.mcs.anl.gov

        BUG FIX: 'location_list in glossary' tested the whole list as a
        single glossary key; each location must be checked individually.
        """
        logger.debug("heckle: System: Validate Job: Verify Locations")
        glossary = self.resources.glossary
        return all(loc in glossary for loc in location_list)

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """
        Find a group of not-busy nodes in which to run each job.

        Arguments:
             job_location_args -- A list of dictionaries with info about the job
                  jobid -- string identifier
                  nodes -- int number of nodes
                  queue -- string queue name
                  required -- ??
                  utility_score -- ??
                  threshold -- ??
                  walltime -- ??
                  attrs -- dictionary of attributes to match against
             end_times -- supposed time the job will end

        Returns: Dictionary with list of nodes a job can run on, keyed by jobid
        Raises: Exception when not enough matching nodes are free for a job.
        """
        logger.debug("heckle: System: find_job_location")
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        job_location_args.sort(key=jobsort)
        #Try to match jobs to nodes which can run them
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                attrs = {}
            else:
                attrs = job['attrs']
            attrs['current reservation'] = 9999999
            attrs['allocatable'] = 'True'
            nodecount = int(job['nodes'])
            # Debug output goes through the logger instead of raw prints.
            logger.debug("Heckle System: Find Job Location: Job is %s" % job)
            logger.debug(
                "Heckle System: Find Job Location: Free Nodes is %s" %
                self.resources.getfreenodes())
            nodelist = (self.resources >= attrs)  # get Matching Node
            logger.debug("Nodelist at this stage is %s" % nodelist)
            if len(nodelist) < nodecount:
                raise Exception(
                    "Heckle System: find_job_locations: Not Enough matching Nodes Available"
                )
            retlist = nodelist[:nodecount]
            self.resources.allocate(retlist)
            logger.debug(
                "Heckle System: Find Job Location: Remaining nodelist is %s" %
                retlist)
            locations[job["jobid"]] = retlist
            logger.debug("Locations is now: %s" % locations)
        logger.info("heckle: find_job_location: locations are %s" % locations)
        return locations

    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """
        Find equivalent queues.

        An equivalent queue is a queue which can run upon the same
        partition(s).  For now, with one partition (everything!) this is
        largely irrelevant.

        Returns: equiv = [{'reservations': [...], 'queues': [...]}]
        """
        logger.debug("Heckle System: find queue equivalence classes")
        equiv = []
        # Group active queues by overlapping node assignments.
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            logger.debug("Heckle Queue is %s" % queue)
            for equ in equiv:
                logger.debug("Heckle Equ is %s" % equ)
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([queue]),
                    'data': set(self.queue_assignments[queue]),
                    'reservations': set()
                })
        # Merge classes that share queues into true equivalence classes.
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        # Attach reservations whose hosts intersect each class, then strip
        # the internal 'data' key and convert the sets to lists.
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def get_partitions(self, locations):
        """
        Work-around to let cqadm run a single job on this system.

        PRE:  locations is a list of strings of possible node names
        POST: if good, return locations;
              if not good, raise an Exception listing the bad nodes.
        """
        # NOTE(review): 'Glossary' (capitalized) is used here while other
        # methods use 'resources.glossary' -- confirm which is correct.
        nodelist = self.resources.Glossary.nodelist
        logger.debug("Heckle System: get_partitions: raw is are: %s" %
                     locations)
        # BUG FIX: the old code logged an undefined name 'locs' and raised
        # with an undefined name 'badlocations' (NameError on both paths).
        badlocations = [loc for loc in locations if loc not in nodelist]
        if not badlocations:
            return locations
        raise Exception(
            "heckle: System: get_partition: Bad Locations: %s " %
            badlocations)

    get_partitions = exposed(get_partitions)
Esempio n. 5
0
 def __init__(self, *args, **kwargs):
     """Initialize the system component.

     Sets up the process-group container, assigns every currently free
     resource to the "default" queue, and creates the ad-hoc
     forbidden-nodes list.
     """
     Component.__init__(self, *args, **kwargs)
     # Items stored in this container are HeckleProcessGroup objects.
     self.process_groups = ProcessGroupDict()
     self.process_groups.item_cls = HeckleProcessGroup
     # All free resources start out assigned to the default queue.
     self.queue_assignments["default"] = self.get_resources()
     self.hacky_forbidden_nodes = []   #This is a temporary fix for the forbidden nodes issue
Esempio n. 6
0
class HeckleSystem(Component):
    """
    Cobalt System component for handling / interacting with the Heckle
    resource manager.

    External Methods:
        add_process_groups -- allocates nodes
        get_process_groups -- get process groups based on specs
        signal_process_groups -- signal a process group
        wait_process_groups -- removes process groups based on specs

    Internal Methods:
        __init__, _start_pg, _check_builds_done, _wait,
        _release_resources, get_resources

    Queue Manager Methods:
        validate_job, verify_locations, find_job_location,
        find_queue_equivalence_classes
    """

    name = "system"
    implementation = "HeckleBreadboard"
    # NOTE: class-level mutable dict -- shared across component instances.
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        """Initialize the component: process-group container, default
        queue assignment, and the temporary forbidden-nodes list."""
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        # Items stored in the container are HeckleProcessGroup objects.
        self.process_groups.item_cls = HeckleProcessGroup
        self.queue_assignments["default"] = self.get_resources()
        self.hacky_forbidden_nodes = []   #This is a temporary fix for the forbidden nodes issue
    def __repr__(self):
        """Return a printable representation of the component's state:
        every instance attribute followed by the tracked process groups."""
        printstr = "Heckle System Object: Values"
        for element in self.__dict__:
            value = self.__dict__[element]
            printstr += str(element) + "::"
            # Identity check: '== None' can be fooled by a custom __eq__.
            if value is None:
                printstr += "None, "
            else:
                printstr += str(value) + ", "
        printstr += "   Process Groups:"
        for element in self.process_groups:
            printstr += str(element) + "::" + \
            str(self.process_groups[element]) + ", "
        return printstr
    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
        Take *specs* (a list of job dicts) and initiate each job as a
        process group.

        A process group abstracts the actual job into an object providing a
        single point of control and interaction for all the nodes within
        that job.  Each job dict may carry, among others:
            size; kernel (name of the kernel image to load); executable
            (command run on the head node -- effectively the job's file);
            stdin / stdout / stderr (file names for standard communication
            with the running job, or False); kerneloptions (string or
            False); args (list); umask (int); jobid (int);
            cobalt_log_file (log file used while initiating and running
            the job); location (list of node / resource name strings);
            env (dict of key:value strings -- the job's environment on the
            node); id (number); mode; nodect; cwd (working directory for
            the job on the node); walltime (minutes allocated for the job);
            user (name of the user the job runs as).
        """
        LOGGER.debug("System:add_process_groups:" + "Specs are %s" % specs)
        return self.process_groups.q_add(specs)
    add_process_groups = exposed(query(add_process_groups))
    
    
    def get_process_groups(self, specs):
        """Return a list of existing allocations matching *specs*."""
        LOGGER.debug("System:get_process_groups: specs are %s" % specs)
        # Reap finished groups before answering the query.
        self._wait()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))
    
    
    def signal_process_groups(self, specs, sig):
        """Deliver signal *sig* to every process group matching *specs*."""
        LOGGER.debug("System:signal_process_groups: Specs are %s, sig is %s"
                     % (specs, sig))

        def deliver(group, signame):
            """Forward the signal to a single group."""
            return group.signal(signame)

        return self.process_groups.q_get(specs, deliver, sig)
    signal_process_groups = exposed(query(signal_process_groups))
    
    
    def wait_process_groups(self, specs):
        """Remove terminated process groups and release their resources."""
        LOGGER.debug("System:wait_process_groups; specs are %s" % specs)

        def release(group, _):
            """Free the nodes held by a single terminated group."""
            return self._release_resources(group)

        return self.process_groups.q_del(specs, release)
    wait_process_groups = exposed(query(wait_process_groups))
    
    
    #########################################
    # Methods for dealing with Process Groups
    #########################################
    
    
    def _check_builds_done(self):
        """
        Check whether the nodes are done building.

        Starts a process group once every node in it has finished building.
        Returns True when no group is still waiting on builds, else False.
        Raises Exception on UNALLOCATED / CRITFAIL boot states.
        """
        exstr = "System:check_build_done:"
        retval = True
        pg_list = [x for x in self.process_groups.itervalues()
                   if len(x.pinging_nodes) > 0]
        hiccup = HeckleConnector()
        for pgp in pg_list:
            # BUG FIX: iterate over a copy -- removing from the list while
            # iterating it used to skip the element after each removal.
            for nodename in pgp.pinging_nodes[:]:
                teststr = hiccup.get_node_bootstate(nodename)
                if teststr == "READY":
                    # READY means Heckle never rebuilt the node; only a
                    # 'fakebuild' job treats that as done.
                    if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                        pgp.pinging_nodes.remove(nodename)
                        # BUG FIX: the old message applied '%' to only part
                        # of a concatenated string, raising TypeError.
                        LOGGER.debug(
                            exstr + "Node %s done building; "
                            "%s pinging nodes left" %
                            (nodename, len(pgp.pinging_nodes)))
                    else:
                        LOGGER.debug(exstr + "Node %s not done yet" %
                                     nodename)
                elif teststr == "COMPLETED":
                    LOGGER.debug(exstr +
                                 "Removing node %s...%i pinging nodes left" %
                                 (nodename, len(pgp.pinging_nodes) - 1))
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ("BOOTING", ""):
                    LOGGER.debug(exstr + "Node %s not done yet." % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception(exstr +
        "Node 'UNALLOCATED'; Possible build error, or system timed out.")
                elif teststr == "CRITFAIL":
                    # TODO: fail more gracefully than aborting the scan.
                    raise Exception(exstr +
                "Node says, 'CRITFAIL'.  It timed out while building.")
            if len(pgp.pinging_nodes) == 0:
                LOGGER.debug(
    "System:Check Build Done: No Pinging Nodes left, Start PG %s Running." \
        % pgp.jobid)
                pgp.start()
            else:
                retval = False
        return retval
    _check_builds_done = automatic(_check_builds_done)
    
    
    def _wait(self):
        """
        Call each process group's wait() method, and drop that group's
        nodes from the temporary forbidden-nodes list.
        """
        waitlen = len(self.process_groups.keys())
        LOGGER.debug("System:_wait:%s process groups." % waitlen)
        for pgp in self.process_groups.itervalues():
            pgp.wait()
            # BUG FIX: 'del list[list]' always raised TypeError (silently
            # swallowed by a bare except), so forbidden nodes were never
            # released.  Remove the group's nodes individually instead.
            for node in pgp.location:
                try:
                    self.hacky_forbidden_nodes.remove(node)
                except ValueError:
                    pass
    _wait = automatic(_wait)
    
    
    def _release_resources(self, pgp):
        """
        Release all of the group's Heckle nodes, unreserving them, and drop
        them from the temporary forbidden-nodes list.
        """
        LOGGER.debug("System:release")
        LOGGER.debug("System:Locations are: %s" % pgp.location)
        hiccup = HeckleConnector()
        hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
        # BUG FIX: 'del list[list]' always raised TypeError (silently
        # swallowed by a bare except); remove the nodes one at a time.
        for node in pgp.location:
            try:
                self.hacky_forbidden_nodes.remove(node)
            except ValueError:
                pass
    
    
    def get_resources(self, specs=None):
        """
        Return a list of free resources (nodes) which match *specs*.

        *specs* is a dict describing a job; when empty or None, the full
        node list is returned instead of a filtered one.
        """
        LOGGER.debug("System:get Resources")
        hiccup = HeckleConnector()
        if specs:
            return hiccup.list_available_nodes(**specs)
        return hiccup.node_list
    get_resources = exposed(query(get_resources))
    
    
    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    
    
    def validate_job(self, spec):
        """
        Validates a job for submission
        -- will the job ever run under the current Heckle configuration?
        Steps:
            1)  Validate Kernel (invalid/missing falls back to 'default')
            2)  Validate HW
            3)  Validate Job versus overall (currently disabled)
        """
        LOGGER.debug( "System:Validate Job: Specs are %s" % spec )
        hiccup = HeckleConnector()
        # NOTE(review): the raise below is caught by this same try/except,
        # so a bad -- or missing -- kernel silently falls back to
        # 'default'; confirm that fallback is the intended behavior.
        try:
            kernel = spec['kernel']
            valid_kernel = hiccup.validkernel( kernel )
            if not valid_kernel:
                raise Exception("System:Validate Job: Bad Kernel")
        except:
            spec['kernel'] = 'default'
        # Hardware check: any failure here (including a missing 'attrs'
        # key) is re-raised as a validation error.
        try:
            valid_hw = hiccup.validhw( **spec['attrs'] )
            if not valid_hw:
                raise Exception(
                "System:Validate Job: Bad Hardware Specs: %s" % spec )
        except Exception as strec:
            raise Exception("System:Validate Job:  Validate Job: %s" % strec)
        #try:
            #valid_job = hiccup.validjob( **spec )
            #if not valid_job:
                #raise Exception(
                #"System: validate Job:  Never enough nodes")
        #except:
            #raise Exception("System: validate Job: Never enough nodes")
        return spec
    validate_job = exposed(validate_job)
    
    
    def verify_locations(self, location_list):
        """
        Make sure a location list is valid.

        location_list is a list of fully qualified node-name strings,
        e.g. nodename.mcs.anl.gov
        Returns the list unchanged, or raises naming the unknown nodes.
        """
        LOGGER.debug("System:validate Job: Verify Locations")
        hiccup = HeckleConnector()
        known = set(hiccup.list_all_nodes())
        requested = set(location_list)
        unknown = requested - known
        if unknown:
            raise Exception(
    "System:VerifyLocations: Invalid location names: %s" % list(unknown))
        return location_list
    verify_locations = exposed( verify_locations )
    
    
    def find_job_location(self, job_location_args, end_times):
        """
        Find a group of not-busy nodes in which to run each job.

        Arguments:
            job_location_args -- A list of dictionaries with info about the job
                jobid -- string identifier
                nodes -- int number of nodes
                queue -- string queue name
                required -- ??
                utility_score -- ??
                threshold -- ??
                walltime -- ??
                attrs -- dictionary of attributes to match against
            end_times -- supposed time the job will end

        Returns: Dictionary with list of nodes a job can run on, keyed by jobid
        """
        LOGGER.debug("System:find_job_location")
        locations = {}
        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]
        job_location_args.sort(key=jobsort)

        #Try to match jobs to nodes which can run them
        hiccup = HeckleConnector()
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            LOGGER.debug("System:find_job_location: Job is %s" % job)
            tempjob = job.copy()
            if self.hacky_forbidden_nodes:
                if 'forbidden' not in tempjob.keys():
                    tempjob['forbidden'] = self.hacky_forbidden_nodes
                else:
                    tempjob['forbidden'].extend( self.hacky_forbidden_nodes )
            try:
                # BUG FIX: the forbidden-node list was assembled in
                # 'tempjob' but the lookup was issued with the original
                # 'job', so forbidden nodes were never actually excluded.
                resources = hiccup.find_job_location(**tempjob)
                if not resources:
                    continue
            except Exception as err:
                LOGGER.info("System:find_job_location: Error %s" % err)
                continue
            node_list = []
            # Build a list of appropriate nodes; each chosen node is also
            # marked forbidden until its job completes.
            for node in resources:
                node_list.append(node)
                self.hacky_forbidden_nodes.append(node)
            locations[job["jobid"]] = node_list
        LOGGER.info("System:find_job_location: locations are %s" % locations )
        return locations
    find_job_location = exposed(find_job_location)
    
    
    def find_queue_equivalence_classes(self, reservation_dict, \
                                                        active_queue_names):
        """
        Finds equivalent queues
        An equivalent queue is a queue which can run upon the same partition(s)
        For now, with one partition (everything!) this is irrelevant.

        reservation_dict -- maps reservation name to a ':'-separated string
            of host names
        active_queue_names -- names of queues currently running

        Returns: equiv= [{'reservations': [], 'queues': ['default']}]
        """
        #LOGGER.debug("System:find queue equivalence classes" )
        equiv = []
        #print "Reservation_Dict is: %s" % reservation_dict
        #print "Active_queue_names is %s" % active_queue_names
        #print "Queue assignments are: %s" % self.queue_assignments
        # Pass 1: group active queues whose resource sets overlap.
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            #print "Heckle Queue is %s" % queue
            for equ in equiv:
                print "Heckle Equ is %s" % equ
                # Overlapping resources => same equivalence class.
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({'queues': set([queue]),
                                'data': set(self.queue_assignments[queue]),
                                'reservations': set()})
        # Pass 2: merge classes that became transitively connected during
        # pass 1 (a later queue can bridge two earlier classes).
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        # Pass 3: attach reservations touching each class, convert the sets
        # to lists for the caller, and drop the internal 'data' key.
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
    
    
    def get_partitions(self, locations):
        """
        Work-around to get the cqadm to run a single job on this system
        PRE:  locations is a list of dict of strings of possible node names
        POST:  if good, return locations
                if not good, raise exception and list bad nodes
        """
        logstr = "System:get_partition: "
        hiccup = HeckleConnector()
        heckle_node_set = set(hiccup.list_all_nodes())
        locs = locations[0]['name']
        LOGGER.debug( logstr + "raw is are: %s" % locations )
        LOGGER.debug( logstr + "vals are: %s" % locs )
        if type(locs) == ListType:
            locset = set(locs)
            badlocations = locset.difference(heckle_node_set)
            if badlocations:
                raise Exception(
        logstr + "Bad Locations: %s " % list(badlocations) )
        elif type(locs) == StringType:
            if locs not in locations:
                raise Exception( logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception( logstr + 
"location needs to be string or list of strings, you provided %s : %s" \
% ( type(locs), locs))
        return locations
    get_partitions = exposed(get_partitions)
Esempio n. 7
0
class HeckleSystem(Component):
    """
    Cobalt System component for handling / interacting with Heckle resource manager
    
    External Methods:
        add_process_groups -- allocates nodes
        get_process_groups -- get process groups based on specs
        signal_process_groups -- signal a process group
        wait_process_groups -- removed process groups based on specs
        
    Internal Methods:
        __init__:
        _start_pg:
        _check_builds_done:
        _wait:
        _release_resources:
        get_resources:
        
    Queue Manager Methods:
        validate_job:
        verify_locations:
        find_job_locations:
        find_queue_equivalence_classes:
    """

    name = "system"
    implementation = "HeckleBreadboard"
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = HeckleProcessGroup
        self.queue_assignments["default"] = self.get_resources()
        self.hacky_forbidden_nodes = [
        ]  #This is a temporary fix for the forbidden nodes issue

    def __repr__(self):
        """
        printout representation of the class
        """
        indict = self.__dict__
        printstr = ""
        printstr += "Heckle System Object: Values"
        for element in indict:
            printstr += str(element) + "::"
            if indict[element] == None:
                printstr += "None, "
            else:
                printstr += str(indict[element]) + ", "
        printstr += "   Process Groups:"
        for element in self.process_groups:
            printstr += str(element) + "::" + \
            str(self.process_groups[element]) + ", "
        return printstr

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
        This function takes the specs (a list of jobs) and initiates each job as
        a process group.
        The process group abstracts the actual job into an object, providing a 
        single point of control and interaction for all the nodes within that job.
        Each job is described by a dict.  Each dict contains:
            size:  
            kernel: a String, the name of the kernel image to load.
            executable: A string, the name of the command to execute upon the
                head node; this could be considered the actual job's file.
            stdin, stdout, stderr:  Three separate strings, each containing
                the file to use for standard communication with the job as it
                is running.  May be specified, or False.
            kerneloptions: A string containing various options for the kernel,
                or False.
            args: A list
            umask: An integer
            jobid: An integer
            cobalt_log_file: A string containing the log file to use in the
                initiation and running of the job itself.
            location:  List of strings of node / resource names
            env:  A dict of key:value strings, specifying the environment in
                which the job is to run on the node
            id: A number
            mode:
            nodect:
            cwd:  A string, specifying the current working directory in which
                to run the job on the node
            walltime:  Integer; the time, in minutes, allocated for the job
                to run on the node.
            user:  A string, the name of the user under which this job is to run.
        """
        logstr = "System:add_process_groups:"
        LOGGER.debug(logstr + "Specs are %s" % specs)
        return self.process_groups.q_add(specs)

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """get a list of existing allocations"""
        LOGGER.debug("System:get_process_groups: specs are %s" % specs)
        self._wait()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        LOGGER.debug(
        "System:signal_process_groups: Specs are %s, sig is %s"\
        % (specs, sig) )
        return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        LOGGER.debug("System:wait_process_groups; specs are %s" % specs)
        return self.process_groups.q_del(specs, lambda x, \
        _:self._release_resources(x))

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################

    def _check_builds_done(self):
        """
        Check to see if the nodes are done building
        Starts the process group if all nodes in them are done building
        """
        #LOGGER.debug( "System:Check Build Done: Waiting to Start..." )
        #sleep(20)
        exstr = "System:check_build_done:"
        retval = True
        pg_list = [x for x in self.process_groups.itervalues()\
        if (len(x.pinging_nodes) > 0)]
        hiccup = HeckleConnector()
        for pgp in pg_list:
            for nodename in pgp.pinging_nodes:
                teststr = hiccup.get_node_bootstate(nodename)
                if teststr == "READY":
                    if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                        pgp.pinging_nodes.remove(nodename)
                        LOGGER.debug( exstr + "Node %s done building; "\
                             + "%s pinging nodes left" %\
                             ( nodename, len(pgp.pinging_nodes)-1 ) )
                    else:
                        LOGGER.debug( exstr + "Node %s not done yet" %\
                                          nodename )
                if teststr == "COMPLETED":
                    LOGGER.debug( exstr +
                         "Removing node %s...%i pinging nodes left" \
                              % (nodename, len(pgp.pinging_nodes)-1) )
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ["BOOTING", "", ""]:
                    LOGGER.debug(exstr + "Node %s not done yet." % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception(
                        exstr +
                        "Node 'UNALLOCATED'; Possible build error, or system timed out."
                    )
                elif teststr == "CRITFAIL":
                    raise Exception(
                        exstr +
                        "Node says, 'CRITFAIL'.  It timed out while building.")
                #####################
                ####     Need to figure a better way to fail gracefully
                #####################
            if len(pgp.pinging_nodes) == 0:
                LOGGER.debug(
                "System:Check Build Done: No Pinging Nodes left, Start PG %s Running." \
        % pgp.jobid)
                pgp.start()
            else:
                retval = False
        return retval

    _check_builds_done = automatic(_check_builds_done)

    def _wait(self):
        """
        Calls the process group container's wait() method
        """
        waitlen = len(self.process_groups.keys())
        LOGGER.debug("System:_wait:%s process groups." % waitlen)
        for pgp in self.process_groups.itervalues():
            pgp.wait()
            try:
                del (self.hacky_forbidden_nodes[pgp.location])
            except:
                pass

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """
        Releases all the Heckle nodes, unreserving them
        """
        LOGGER.debug("System:release")
        LOGGER.debug("System:Locations are: %s" % pgp.location)
        hiccup = HeckleConnector()
        hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
        try:
            del (self.hacky_forbidden_nodes[pgp.location])
        except:
            pass

    def get_resources(self, specs=None):
        """
        Returns a list of free resources (nodes) which match the given specs.
        Specs is a dict which describes a job
        """
        LOGGER.debug("System:get Resources")
        ##################################
        ###  Look at this as a future change
        ##################################
        hiccup = HeckleConnector()
        if not specs:
            return hiccup.node_list
        else:
            return hiccup.list_available_nodes(**specs)

    get_resources = exposed(query(get_resources))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################

    def validate_job(self, spec):
        """
        Validates a job for submission
        -- will the job ever run under the current Heckle configuration?
        Steps:
            1)  Validate Kernel
            2)  Validate HW
            3)  Validate Job versus overall
        """
        LOGGER.debug("System:Validate Job: Specs are %s" % spec)
        hiccup = HeckleConnector()
        try:
            kernel = spec['kernel']
            valid_kernel = hiccup.validkernel(kernel)
            if not valid_kernel:
                raise Exception("System:Validate Job: Bad Kernel")
        except:
            spec['kernel'] = 'default'
        try:
            valid_hw = hiccup.validhw(**spec['attrs'])
            if not valid_hw:
                raise Exception("System:Validate Job: Bad Hardware Specs: %s" %
                                spec)
        except Exception as strec:
            raise Exception("System:Validate Job:  Validate Job: %s" % strec)
        #try:
        #valid_job = hiccup.validjob( **spec )
        #if not valid_job:
        #raise Exception(
        #"System: validate Job:  Never enough nodes")
        #except:
        #raise Exception("System: validate Job: Never enough nodes")
        return spec

    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """
        Makes sure a location list is valid
        location list is a list of fully qualified strings of node names
        ex:  nodename.mcs.anl.gov
        """
        LOGGER.debug("System:validate Job: Verify Locations")
        hiccup = HeckleConnector()
        heckle_set = set(hiccup.list_all_nodes())
        location_set = set(location_list)
        if heckle_set >= location_set:
            return location_list
        else:
            not_valid_list = list(location_set.difference(heckle_set))
            raise Exception(
                "System:VerifyLocations: Invalid location names: %s" %
                not_valid_list)

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """
        Finds a group of not-busy nodes in which to run the job
        
        Arguments:
            job_location_args -- A list of dictionaries with info about the job
                jobid -- string identifier
                nodes -- int number of nodes
                queue -- string queue name
                required -- ??
                utility_score -- ??
                threshold -- ??
                walltime -- ??
                attrs -- dictionary of attributes to match against
            end_times -- supposed time the job will end
            
        Returns: Dictionary with list of nodes a job can run on, keyed by jobid
        """
        LOGGER.debug("System:find_job_location")
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        job_location_args.sort(key=jobsort)

        #Try to match jobs to nodes which can run them
        hiccup = HeckleConnector()
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            print "Job is %s" % job
            tempjob = job.copy()
            if self.hacky_forbidden_nodes:
                if 'forbidden' not in tempjob.keys():
                    tempjob['forbidden'] = self.hacky_forbidden_nodes
                else:
                    tempjob['forbidden'].extend(self.hacky_forbidden_nodes)
            #############################
            ###  Look at this as point of change
            ###  Think:  For node in unreserved nodes
            ###            Choose node from list
            ###            Remove node from unreserved nodes
            #############################
            try:
                resources = hiccup.find_job_location(**
                                                     job)  #get matching nodes
                if not resources:
                    continue
            except Exception as err:
                LOGGER.info("System:find_job_location: Error %s" % err)
                continue
            node_list = []
            # Build a list of appropriate nodes
            for node in resources:
                node_list.append(node)
                self.hacky_forbidden_nodes.append(node)
            locations[job["jobid"]] = node_list
        LOGGER.info("System:find_job_location: locations are %s" % locations)
        return locations

    find_job_location = exposed(find_job_location)


    def find_queue_equivalence_classes(self, reservation_dict, \
                                                        active_queue_names):
        """
        Finds equivalent queues
        An equivalent queue is a queue which can run upon the same partition(s)
        For now, with one partition (everything!) this is irrelevant.
        Returns: equiv= [{'reservations': [], 'queues': ['default']}]
        """
        #LOGGER.debug("System:find queue equivalence classes" )
        equiv = []
        #print "Reservation_Dict is: %s" % reservation_dict
        #print "Active_queue_names is %s" % active_queue_names
        #print "Queue assignments are: %s" % self.queue_assignments
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            #print "Heckle Queue is %s" % queue
            for equ in equiv:
                print "Heckle Equ is %s" % equ
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([queue]),
                    'data': set(self.queue_assignments[queue]),
                    'reservations': set()
                })
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def get_partitions(self, locations):
        """
        Work-around to get the cqadm to run a single job on this system
        PRE:  locations is a list of dict of strings of possible node names
        POST:  if good, return locations
                if not good, raise exception and list bad nodes
        """
        logstr = "System:get_partition: "
        hiccup = HeckleConnector()
        heckle_node_set = set(hiccup.list_all_nodes())
        locs = locations[0]['name']
        LOGGER.debug(logstr + "raw is are: %s" % locations)
        LOGGER.debug(logstr + "vals are: %s" % locs)
        if type(locs) == ListType:
            locset = set(locs)
            badlocations = locset.difference(heckle_node_set)
            if badlocations:
                raise Exception(logstr +
                                "Bad Locations: %s " % list(badlocations))
        elif type(locs) == StringType:
            if locs not in locations:
                raise Exception(logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception( logstr +
                             "location needs to be string or list of strings, you provided %s : %s" \
% ( type(locs), locs))
        return locations

    get_partitions = exposed(get_partitions)
Esempio n. 8
0
class HeckleSystem(Component):
    """
     Cobalt System component for handling / interacting with Heckle resource manager
     
     External Methods:
          add_process_groups -- allocates nodes
          get_process_groups -- get process groups based on specs
          signal_process_groups -- signal a process group
          wait_process_groups -- removed process groups based on specs
          
     Internal Methods:
          __init__:
          _start_pg:
          _check_builds_done:
          _wait:
          _release_resources:
          get_resources:
          
     Queue Manager Methods:
          validate_job:
          verify_locations:
          find_job_locations:
          find_queue_equivalence_classes:
     """

    name = "system"
    implementation = "HeckleBreadboard"
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        """Set up the component, its process-group container and the
        resource dictionary, then assign every resource to 'default'."""
        # Leftover debug tracing: records which thread constructs the component.
        logger.debug(
            "heckle: System: init ... %s ... &&&&&&&&&&&&&&&&&&&&&&&&&&&&&  I am here as well &&&&&&&&&&&&&&&&&&&&&&&&&"
            % threading.current_thread().getName()
        )
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        # Entries added to process_groups are instantiated as HeckleProcessGroup.
        self.process_groups.item_cls = HeckleProcessGroup
        self.resources = ResourceDict()
        # The single "default" queue is given every known resource name.
        self.queue_assignments["default"] = self.resources.keys()
        print "\n\n\n\n"
        print "Queue assignments are: %s" % self.queue_assignments

    def __repr__(self):
        """Readable dump of the component's attributes and process groups."""
        pieces = ["Heckle System Object: Values"]
        for key, val in self.__dict__.items():
            pieces.append(str(key) + "::")
            pieces.append("None, " if val == None else str(val) + ", ")
        pieces.append("   Process Groups:")
        for key in self.process_groups:
            pieces.append(str(key) + "::" + str(self.process_groups[key]) + ", ")
        return "".join(pieces)

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
          Allocate nodes and add the list of those allocated to the PGDict
          specs is a list of dictionaries
          Each dictionary contains the specifications for all the nodes in the process group
          NOTE: only specs[0] is consulted -- assumes one job per call.
          """
        # Debug - Take out to really rebuild
        ####    Need to check the environment variable for fakebuild
        # Pull the 'fakebuild' flag out of the job's env (if any) and promote
        # it to a top-level spec key; bare except covers missing keys.
        try:
            specs[0]["fakebuild"] = specs[0]["env"]["fakebuild"]
            del specs[0]["env"]["fakebuild"]
        except:
            pass
        print "Heckle System:  add_process_groups: <<<<<<<<<<<<<<<<<<          OK< Debug< This< :  %s" % specs
        HICCUP = HeckleConnector()
        # try:
        # Reserve the nodes through Heckle; the reservation id is recorded
        # on the spec and handed to _start_pg for each new group.
        reservation = HICCUP.make_reservation(**(specs[0]))
        heckle_res_id = reservation.id
        uid = specs[0]["user"]
        logger.debug("Heckle System: heckle_res_id = %i" % heckle_res_id)
        specs[0]["heckle_res_id"] = heckle_res_id
        return self.process_groups.q_add(specs, lambda x, _: self._start_pg(x, heckle_res_id=heckle_res_id, uid=uid))
        # except Exception as hec_except:
        ## could do something here about problems
        ##    1)  Kill job, then resubmit job w/o node name(s)
        ##         Would require access to cqadm via api
        ##    2)  Put job / node in fail state
        ##    3)  Simply fail
        # raise Exception("Heckle System Object: add_process_groups: %s" % hec_except)

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Return the existing process groups that match *specs*."""
        # Reap finished groups first so the answer reflects current state.
        self._wait()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Deliver signal *sig* to every process group matching *specs*."""
        logger.debug("Heckle System: signal_process_groups: Specs are %s, sig is %s" % (specs, sig))
        deliver = lambda group, signame: group.signal(signame)
        return self.process_groups.q_get(specs, deliver, sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Delete terminated process groups, releasing their Heckle nodes."""
        logger.debug("Heckle System: wait_process_groups; specs are %s" % specs)

        def reap(group, _):
            # Free the reservation of each group as it is removed.
            return self._release_resources(group)

        return self.process_groups.q_del(specs, reap)

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################

    def _start_pg(self, pgp, heckle_res_id, uid):
        """
          Populates the process group with its resources
               gets node information for nodes in process group
               Updates those attributes
               Places nodes in the pinging nodes list, to see if they're built
          """
        logger.debug("Heckle System: start_pg: PGP is %s" % pgp)
        nodelist = pgp.location
        for node in nodelist:
            node_attributes = self.resources[node]
            # Heckle reports MACs with dashes; downstream expects colons.
            node_attributes["mac"] = node_attributes["mac"].replace("-", ":")
            node_attributes["heckle_res_id"] = heckle_res_id
            pgp.resource_attributes[node] = node_attributes._get_dict()
        pgp.uid = uid
        # BUG FIX: append() added the whole list as a single element, but
        # _check_builds_done iterates pinging_nodes expecting individual
        # node names (and remove()s them one by one) -- use extend().
        pgp.pinging_nodes.extend(nodelist)
    # BUG FIX: removed stray duplicate
    # 'add_process_groups = exposed(query(add_process_groups))' that
    # re-wrapped the already-exposed method a second time.

    def _check_builds_done(self):
        """
          Check to see if the nodes are done building
          Starts the process group if all nodes in them are done building
          Returns True when no group is still waiting on node builds.
          """
        # logger.debug( "heckle: System: Check Build Done: Waiting to Start..." )
        # sleep(20)
        retval = True
        # Only examine groups that still have nodes waiting to come up.
        pg_list = [x for x in self.process_groups.itervalues() if (len(x.pinging_nodes) > 0)]
        # Refresh bootstate info from Heckle before checking.
        self.resources.update()
        for pgp in pg_list:
            # NOTE(review): pinging_nodes is mutated (remove) while being
            # iterated, which skips the element after each removal -- the
            # node is then caught on a later automatic invocation.
            for nodename in pgp.pinging_nodes:
                teststr = self.resources[nodename]["bootstate"]
                if teststr == "COMPLETED":
                    logger.debug(
                        "heckle: System: Check Build Done: Removing node %s...%i pinging nodes left"
                        % (nodename, len(pgp.pinging_nodes) - 1)
                    )
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ["BOOTING", "", ""]:  # NOTE(review): duplicate "" is redundant
                    logger.debug("Heckle System: Check Build Done: Node %s not done yet." % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'UNALLOCATED'.  Possible build error, or system timed out."
                    )
                elif teststr == "CRITFAIL":
                    raise Exception("HIC_SO: _check_builds_done: Node says, 'CRITFAIL'.  It timed out while building.")
                    #####################
                    ####      Need to figure a better way to fail gracefully on this one...
                    #####################
                elif teststr == "READY":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'READY'.  The Heckle Reservation is already ready already, skipping pinging."
                    )
            # All nodes up: hand the group over to execution.
            if len(pgp.pinging_nodes) == 0:
                logger.debug("Heckle System: Check Build Done: No Pinging Nodes left, Start PG Running.")
                pgp.start()
            else:
                retval = False
        return retval

    _check_builds_done = automatic(_check_builds_done)

    def _wait(self):
        """Invoke wait() on every process group in the container."""
        for group in self.process_groups.itervalues():
            group.wait()

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Unreserve every Heckle node belonging to this process group."""
        logger.debug("Heckle System: Release %s" % pgp.location)
        connection = HeckleConnector()
        connection.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
        # Mark the nodes free in our local resource dictionary as well.
        self.resources.free(nodes=pgp.location)

    def get_resources(self, specs=None):
        """
          Returns a list of names for all the FREE resources (nodes) which
          match the given specs.

          specs -- dict of attribute:value constraints (default: match all
                   free nodes).  A private copy is taken, so neither the
                   caller's dict nor the function default is ever mutated.
          """
        logger.debug("Heckle System: get Resources, specs are %s" % specs)
        ##################################
        ###  Look at this as a future change
        ##################################
        # Copy before adding filter keys: the original used a mutable default
        # argument ({}) and wrote into it, polluting the shared default (and
        # any caller-supplied dict) across calls.
        specs = dict(specs) if specs else {}
        specs["current reservation"] = 9999999
        specs["allocatable"] = "True"
        res_list = self.resources >= specs
        logger.debug("Heckle System: get Resources, resources are %s" % res_list)
        return res_list

    # exposed()/query() come from the Cobalt component framework -- publishes
    # the method on the component's RPC interface with spec-query handling.
    get_resources = exposed(query(get_resources))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################

    def validate_job(self, spec):
        """
          Validates a job for submission
          -- will the job ever run under the current Heckle configuration?
          Steps:
               1)  Enough nodes exist in the system at all
               2)  Every requested attribute exists with the requested value
               3)  Enough free nodes match the full request
          spec -- job description dict; reads 'attrs' (dict, optional) and
                  'nodecount' (optional, default 1).
          Returns spec unchanged on success; raises Exception otherwise.
          """
        logger.debug("Heckle System: Validate Job: Specs are %s" % spec)
        # Copy so the filter keys added in step 3 never pollute the caller's
        # spec['attrs'] dict (the original mutated it in place).
        checklist = dict(spec.get("attrs") or {})
        nodecount = spec.get("nodecount", 1)
        glossary = self.resources.glossary
        dnelist = []  # for attributes which do not exist in glossary
        badlist = []  # for attribute values the glossary does not contain
        ##################################
        ###  Look at this as a future change
        ###  Think:  Refresh Resources Info
        ##################################
        # 1st step:  Are there enough nodes at all?
        # (The original test was inverted -- it raised when there WERE enough
        # nodes -- and the message called the nonexistent nodecount() method.)
        total_nodes = self.resources.node_count()
        if nodecount > total_nodes:
            raise Exception(
                "Validate Job: Not enough nodes; Requested %s, only have %s in the system."
                % (nodecount, total_nodes)
            )
        # 2nd step: every requested attribute must exist, with a valid value.
        for att, val in checklist.items():
            if att not in glossary:
                dnelist.append(att)  # Attribute does not exist
            elif val not in glossary[att]:
                badlist.append("%s:%s" % (att, val))  # Bad attribute value
        # 3rd step: enough FREE nodes matching the full request.  (This block
        # was originally indented inside the loop above, so it ran once per
        # attribute and was skipped entirely for jobs with no attrs.)
        checklist["current reservation"] = 9999999
        checklist["allocatable"] = "True"
        retlist = self.resources >= checklist
        retcount = len(retlist)
        if retcount >= nodecount and not badlist and not dnelist:
            return spec  # Good Job!
        retstr = "Validate Job: "
        if badlist or dnelist:
            if badlist:
                # (fixes the original 'restr' typo, which raised NameError here)
                retstr += "No value for attribute: %s" % badlist
            if dnelist:
                retstr += "Attributes Do Not Exist: %s" % dnelist
        else:
            retstr += "Need %s nodes, only have %s nodes:  %s" % (nodecount, retcount, retlist)
        raise Exception(retstr)

    # exposed() comes from the Cobalt component framework -- publishes the
    # method on the component's RPC interface.
    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """
          Makes sure a location list is valid
          location list is a list of fully qualified strings of node names
          ex:  nodename.mcs.anl.gov
          Returns a truthy value when the locations are considered valid.
          """
        logger.debug("heckle: System: Validate Job: Verify Locations")
        # NOTE(review): this asks whether the LIST ITSELF is a member of the
        # glossary, not whether each node name in the list is known.  Unless
        # ResourceDict.glossary defines containment over lists, a per-name
        # check is probably what was intended -- confirm against ResourceDict.
        return location_list in self.resources.glossary

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """
          Finds a group of not-busy nodes in which to run the job

          Arguments:
               job_location_args -- A list of dictionaries with info about the job
                    jobid -- string identifier
                    nodes -- int number of nodes
                    queue -- string queue name
                    required -- ??
                    utility_score -- ??
                    threshold -- ??
                    walltime -- ??
                    attrs -- dictionary of attributes to match against
               end_times -- supposed time the job will end (unused in this body)

          Returns: Dictionary with list of nodes a job can run on, keyed by jobid
          """
        logger.debug("heckle: System: find_job_location")
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        # NOTE(review): this sorts ASCENDING, so the lowest-utility job is
        # placed first; schedulers typically place the highest-utility job
        # first -- confirm this ordering is intended.
        job_location_args.sort(key=jobsort)
        # Try to match jobs to nodes which can run them
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                attrs = {}
            else:
                attrs = job["attrs"]
            # NOTE(review): when the job supplies attrs, 'attrs' aliases
            # job["attrs"], so the two filter keys below are written back into
            # the caller's job dict -- confirm that side effect is harmless.
            attrs["current reservation"] = 9999999
            attrs["allocatable"] = "True"
            nodecount = int(job["nodes"])
            print "Heckle System: Find Job Location: Job is %s" % job
            #############################
            ###  Look at this as point of change
            ###  Think:  For node in unreserved nodes
            ###            Choose node from list
            ###            Remove node from unreserved nodes
            #############################
            print "Heckle System: Find Job Location: Free Nodes is %s" % self.resources.getfreenodes()
            nodelist = self.resources >= attrs  # get Matching Node
            print "Nodelist at this stage is %s" % nodelist
            if len(nodelist) >= nodecount:
                print "Nodecount = %s" % nodecount
                # Take the first 'nodecount' matches and mark them allocated.
                retlist = nodelist[:nodecount]
                self.resources.allocate(retlist)
                print "Heckle System: Find Job Location: Remaining nodelist is %s" % retlist
            else:
                # NOTE(review): raising here aborts placement of ALL jobs in
                # this call; returning the partial 'locations' built so far may
                # be what the scheduler expects -- verify against callers.
                raise Exception("Heckle System: find_job_locations: Not Enough matching Nodes Available")
            locations[job["jobid"]] = retlist
            print "Locations is now: %s" % locations
        logger.info("heckle: find_job_location: locations are %s" % locations)
        return locations

    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict, active_queue_names):
        """
        Finds equivalent queues
        An equivalent queue is a queue which can run upon the same partition(s)
        For now, with one partition (everything!) this is irrelevant.
        Returns: equiv= [{'reservations': [], 'queues': ['default']}]
        """
        logger.debug("Heckle System: find queue equivalence classes")
        equiv = []
        # print "Reservation_Dict is: %s" % reservation_dict
        # print "Active_queue_names is %s" % active_queue_names
        # print "Queue assignments are: %s" % self.queue_assignments
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            print "Heckle Queue is %s" % queue
            for equ in equiv:
                print "Heckle Equ is %s" % equ
                if equ["data"].intersection(self.queue_assignments[queue]):
                    equ["queues"].add(queue)
                    equ["data"].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append(
                    {"queues": set([queue]), "data": set(self.queue_assignments[queue]), "reservations": set()}
                )
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ["queues"].intersection(eq_class["queues"]):
                    equ["queues"].update(eq_class["queues"])
                    equ["data"].update(eq_class["data"])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class["data"]:
                        eq_class["reservations"].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class["data"]
        return equiv

    # exposed() comes from the Cobalt component framework -- publishes the
    # method on the component's RPC interface.
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def get_partitions(self, locations):
        """
          Work-around to get the cqadm to run a single job on this system
          PRE:  locations is a list of strings of possible node names
          POST:  if every location names a known node, return locations
                 if not good, raise exception listing the bad nodes
          """
        # NOTE(review): every other method reads self.resources.glossary
        # (lowercase); confirm which attribute ResourceDict really exposes.
        nodelist = self.resources.Glossary.nodelist
        logger.debug("Heckle System: get_partitions: raw is are: %s" % locations)
        # Collect the names that are not known nodes.  (The original logged an
        # undefined 'locs', tested whether the whole LIST was one element of
        # nodelist, and raised with an undefined 'badlocations' -- NameErrors
        # plus the wrong membership semantics for "list the bad nodes".)
        badlocations = [loc for loc in locations if loc not in nodelist]
        logger.debug("Heckle System: get_partitions: vals are: %s" % badlocations)
        if not badlocations:
            return locations
        raise Exception("heckle: System: get_partition: Bad Locations: %s " % badlocations)

    # exposed() comes from the Cobalt component framework -- publishes the
    # method on the component's RPC interface.
    get_partitions = exposed(get_partitions)