Example no. 1
class HeckleSystem(Component):
    """
    Cobalt System component for handling / interacting with Heckle resource manager
    
    External Methods:
        add_process_groups -- allocates nodes
        get_process_groups -- get process groups based on specs
        signal_process_groups -- signal a process group
        wait_process_groups -- removed process groups based on specs
        
    Internal Methods:
        __init__:
        _start_pg:
        _check_builds_done:
        _wait:
        _release_resources:
        get_resources:
        
    Queue Manager Methods:
        validate_job:
        verify_locations:
        find_job_locations:
        find_queue_equivalence_classes:
    """
        
    name = "system"
    implementation = "HeckleBreadboard"
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = HeckleProcessGroup
        self.queue_assignments["default"] = self.get_resources()
        self.hacky_forbidden_nodes = []   # temporary fix for the forbidden-nodes issue

    def __repr__(self):
        """
        printout representation of the class
        """
        indict = self.__dict__
        printstr = ""
        printstr += "Heckle System Object: Values"
        for element in indict:
            printstr += str(element) + "::"
            if indict[element] is None:
                printstr += "None, "
            else:
                printstr += str(indict[element]) + ", "
        printstr += "   Process Groups:"
        for element in self.process_groups:
            printstr += str(element) + "::" + \
            str(self.process_groups[element]) + ", "
        return printstr
    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
        This function takes the specs (a list of jobs) and initiates each job as
        a process group.
        The process group abstracts the actual job into an object, providing a 
        single point of control and interaction for all the nodes within that job.
        Each job is described by a dict.  Each dict contains:
            size:  
            kernel: a String, the name of the kernel image to load.
            executable: A string, the name of the command to execute upon the
                head node; this could be considered the actual job's file.
            stdin, stdout, stderr:  Three separate strings, each containing
                the file to use for standard communication with the job as it
                is running.  May be specified, or False.
            kerneloptions: A string containing various options for the kernel,
                or False.
            args: A list
            umask: An integer
            jobid: An integer
            cobalt_log_file: A string containing the log file to use in the
                initiation and running of the job itself.
            location:  List of strings of node / resource names
            env:  A dict of key:value strings, specifying the environment in
                which the job is to run on the node
            id: A number
            mode:
            nodect:
            cwd:  A string, specifying the current working directory in which
                to run the job on the node
            walltime:  Integer; the time, in minutes, allocated for the job
                to run on the node.
            user:  A string, the name of the user under which this job is to run.
        """
        logstr = "System:add_process_groups:"
        LOGGER.debug( logstr + "Specs are %s" % specs )
        return self.process_groups.q_add(specs)
    add_process_groups = exposed(query(add_process_groups))
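    # A minimal sketch of one spec dict of the kind described above; every
    # value here is hypothetical, not taken from a real Cobalt job:
    #
    #   spec = {'jobid': 42, 'user': 'someuser', 'size': 1, 'nodect': 1,
    #           'mode': 'script', 'walltime': 30, 'kernel': 'default',
    #           'kerneloptions': False, 'executable': '/home/someuser/job.sh',
    #           'args': [], 'umask': 18, 'cwd': '/home/someuser',
    #           'env': {'PATH': '/usr/bin'}, 'location': ['node1.mcs.anl.gov'],
    #           'stdin': False, 'stdout': False, 'stderr': False,
    #           'cobalt_log_file': '/home/someuser/42.cobaltlog'}
    #
    #   system.add_process_groups([spec])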
    
    
    def get_process_groups(self, specs):
        """get a list of existing allocations"""
        LOGGER.debug( "System:get_process_groups: specs are %s" % specs )
        self._wait()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))
    
    
    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        LOGGER.debug(
            "System:signal_process_groups: Specs are %s, sig is %s"
            % (specs, sig))
        return self.process_groups.q_get(specs, lambda x, y:x.signal(y), sig)
    signal_process_groups = exposed(query(signal_process_groups))
    
    
    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        LOGGER.debug( "System:wait_process_groups; specs are %s" % specs )
        return self.process_groups.q_del(specs,
                                         lambda x, _: self._release_resources(x))
    wait_process_groups = exposed(query(wait_process_groups))
    
    
    #########################################
    # Methods for dealing with Process Groups
    #########################################
    
    
    def _check_builds_done(self):
        """
        Check to see if the nodes are done building
        Starts the process group if all nodes in them are done building
        """
        #LOGGER.debug( "System:Check Build Done: Waiting to Start..." )
        #sleep(20)
        exstr = "System:check_build_done:"
        retval = True
        pg_list = [x for x in self.process_groups.itervalues()
                   if len(x.pinging_nodes) > 0]
        hiccup = HeckleConnector()
        for pgp in pg_list:
            for nodename in pgp.pinging_nodes[:]:  # iterate a copy; nodes are removed as they finish
                teststr = hiccup.get_node_bootstate(nodename)
                if teststr == "READY":
                    if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                        pgp.pinging_nodes.remove(nodename)
                        LOGGER.debug(exstr + "Node %s done building; "
                                     "%s pinging nodes left"
                                     % (nodename, len(pgp.pinging_nodes)))
                    else:
                        LOGGER.debug(exstr + "Node %s not done yet" % nodename)
                elif teststr == "COMPLETED":
                    LOGGER.debug(exstr +
                                 "Removing node %s...%i pinging nodes left"
                                 % (nodename, len(pgp.pinging_nodes) - 1))
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ["BOOTING", ""]:
                    LOGGER.debug(exstr + "Node %s not done yet." % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception( exstr +
        "Node 'UNALLOCATED'; Possible build error, or system timed out.")
                elif teststr == "CRITFAIL":
                    raise Exception( exstr +
                "Node says, 'CRITFAIL'.  It timed out while building.")
                #####################
                ####     Need to figure a better way to fail gracefully
                #####################
            if len(pgp.pinging_nodes) == 0:
                LOGGER.debug(
                    "System:Check Build Done: No Pinging Nodes left, "
                    "Start PG %s Running." % pgp.jobid)
                pgp.start()
            else:
                retval = False
        return retval
    _check_builds_done = automatic(_check_builds_done)
    
    
    def _wait(self):
        """
        Calls the process group container's wait() method
        """
        waitlen = len( self.process_groups.keys() )
        LOGGER.debug( "System:_wait:%s process groups." % waitlen )
        for pgp in self.process_groups.itervalues():
            pgp.wait()
            # pgp.location is a list of node names; drop each one from the
            # forbidden list now that the process group has been waited on
            for node in pgp.location:
                if node in self.hacky_forbidden_nodes:
                    self.hacky_forbidden_nodes.remove(node)
    _wait = automatic(_wait)
    
    
    def _release_resources(self, pgp):
        """
        Releases all the Heckle nodes, unreserving them
        """
        LOGGER.debug( "System:release" )
        LOGGER.debug( "System:Locations are: %s" % pgp.location )
        hiccup = HeckleConnector()
        hiccup.free_reserved_node( uid = pgp.uid, node_list=pgp.location )
        # pgp.location is a list of node names; drop each one from the
        # forbidden list as the nodes are released
        for node in pgp.location:
            if node in self.hacky_forbidden_nodes:
                self.hacky_forbidden_nodes.remove(node)
    
    
    def get_resources(self, specs=None):
        """
        Returns a list of free resources (nodes) which match the given specs.
        Specs is a dict which describes a job
        """
        LOGGER.debug( "System:get Resources" )
        ##################################
        ###  Look at this as a future change
        ##################################
        hiccup = HeckleConnector()
        if not specs:
            return hiccup.node_list
        else:
            return hiccup.list_available_nodes( **specs )
    get_resources = exposed(query(get_resources))
    
    
    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    
    
    def validate_job(self, spec):
        """
        Validates a job for submission
        -- will the job ever run under the current Heckle configuration?
        Steps:
            1)  Validate Kernel
            2)  Validate HW
            3)  Validate Job versus overall
        """
        LOGGER.debug( "System:Validate Job: Specs are %s" % spec )
        hiccup = HeckleConnector()
        try:
            kernel = spec['kernel']
            valid_kernel = hiccup.validkernel( kernel )
            if not valid_kernel:
                raise Exception("System:Validate Job: Bad Kernel")
        except Exception:
            spec['kernel'] = 'default'
        try:
            valid_hw = hiccup.validhw( **spec['attrs'] )
            if not valid_hw:
                raise Exception(
                    "System:Validate Job: Bad Hardware Specs: %s" % spec)
        except Exception as strec:
            raise Exception("System:Validate Job:  Validate Job: %s" % strec)
        #try:
            #valid_job = hiccup.validjob( **spec )
            #if not valid_job:
                #raise Exception(
                #"System: validate Job:  Never enough nodes")
        #except:
            #raise Exception("System: validate Job: Never enough nodes")
        return spec
    validate_job = exposed(validate_job)
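    # Hedged illustration of the fallback behavior above (values hypothetical):
    # a spec naming an unknown kernel is not rejected; it is reset to 'default',
    # while unsatisfiable hardware 'attrs' raise an Exception instead.
    #
    #   spec = {'kernel': 'no-such-kernel', 'attrs': {'cpu': 'intel'}}
    #   spec = system.validate_job(spec)   # spec['kernel'] is now 'default'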
    
    
    def verify_locations(self, location_list):
        """
        Makes sure a location list is valid
        location list is a list of fully qualified strings of node names
        ex:  nodename.mcs.anl.gov
        """
        LOGGER.debug("System:validate Job: Verify Locations")
        hiccup = HeckleConnector()
        heckle_set = set(hiccup.list_all_nodes())
        location_set = set(location_list)
        if heckle_set >= location_set:
            return location_list
        else:
            not_valid_list = list(location_set.difference(heckle_set))
            raise Exception(
                "System:VerifyLocations: Invalid location names: %s"
                % not_valid_list)
    verify_locations = exposed( verify_locations )
    
    
    def find_job_location(self, job_location_args, end_times):
        """
        Finds a group of not-busy nodes in which to run the job
        
        Arguments:
            job_location_args -- A list of dictionaries with info about the job
                jobid -- string identifier
                nodes -- int number of nodes
                queue -- string queue name
                required -- ??
                utility_score -- ??
                threshold -- ??
                walltime -- ??
                attrs -- dictionary of attributes to match against
            end_times -- projected end times of currently running jobs
            
        Returns: Dictionary with list of nodes a job can run on, keyed by jobid
        """
        LOGGER.debug("System:find_job_location" )
        locations = {}
        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]
        job_location_args.sort(key=jobsort)
        
        #Try to match jobs to nodes which can run them
        hiccup = HeckleConnector()
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            print "Job is %s" % job
            tempjob = job.copy()
            if self.hacky_forbidden_nodes:
                if 'forbidden' not in tempjob.keys():
                    tempjob['forbidden'] = self.hacky_forbidden_nodes
                else:
                    tempjob['forbidden'].extend( self.hacky_forbidden_nodes )
            #############################
            ###  Look at this as point of change
            ###  Think:  For node in unreserved nodes
            ###            Choose node from list
            ###            Remove node from unreserved nodes
            #############################
            try:
                resources = hiccup.find_job_location(**tempjob)  # get matching nodes, honoring 'forbidden'
                if not resources:
                    continue
            except Exception as err:
                LOGGER.info("System:find_job_location: Error %s" % err)
                continue
            node_list = []
            # Build a list of appropriate nodes
            for node in resources:
                node_list.append(node)
                self.hacky_forbidden_nodes.append(node)
            locations[job["jobid"]] = node_list
        LOGGER.info("System:find_job_location: locations are %s" % locations )
        return locations
    find_job_location = exposed(find_job_location)
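    # Illustrative call (all values hypothetical): one pending job asking for
    # two nodes; the result maps each jobid to the node names chosen for it.
    #
    #   args = [{'jobid': '42', 'nodes': 2, 'queue': 'default', 'required': [],
    #            'utility_score': 0.5, 'threshold': 1, 'walltime': 30,
    #            'attrs': {}}]
    #   locations = system.find_job_location(args, end_times=[])
    #   # e.g. {'42': ['node1.mcs.anl.gov', 'node2.mcs.anl.gov']}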
    
    
    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """
        Finds equivalent queues
        An equivalent queue is a queue which can run upon the same partition(s)
        For now, with one partition (everything!) this is irrelevant.
        Returns: equiv= [{'reservations': [], 'queues': ['default']}]
        """
        #LOGGER.debug("System:find queue equivalence classes" )
        equiv = []
        #print "Reservation_Dict is: %s" % reservation_dict
        #print "Active_queue_names is %s" % active_queue_names
        #print "Queue assignments are: %s" % self.queue_assignments
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            #print "Heckle Queue is %s" % queue
            for equ in equiv:
                print "Heckle Equ is %s" % equ
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({'queues': set([queue]),
                                'data': set(self.queue_assignments[queue]),
                                'reservations': set()})
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
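    # Sketch of the merging above with hypothetical assignments: given
    #   queue_assignments = {'default': ['node1', 'node2'], 'short': ['node2']}
    # the two queues share a node, so they collapse into one class and the
    # method returns [{'reservations': [], 'queues': ['default', 'short']}]
    # (the intermediate 'data' node set is deleted before returning).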
    
    
    def get_partitions(self, locations):
        """
        Work-around to get the cqadm to run a single job on this system
        PRE:  locations is a list of dict of strings of possible node names
        POST:  if good, return locations
                if not good, raise exception and list bad nodes
        """
        logstr = "System:get_partition: "
        hiccup = HeckleConnector()
        heckle_node_set = set(hiccup.list_all_nodes())
        locs = locations[0]['name']
        LOGGER.debug(logstr + "raw locations are: %s" % locations)
        LOGGER.debug(logstr + "vals are: %s" % locs)
        if isinstance(locs, list):
            locset = set(locs)
            badlocations = locset.difference(heckle_node_set)
            if badlocations:
                raise Exception(logstr +
                    "Bad Locations: %s " % list(badlocations))
        elif isinstance(locs, str):
            if locs not in heckle_node_set:  # check the single node name against Heckle's node list
                raise Exception(logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception(logstr +
                "location needs to be a string or list of strings; you provided %s : %s"
                % (type(locs), locs))
        return locations
    get_partitions = exposed(get_partitions)
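
The sketch below shows, under stated assumptions, how the component's queue-manager hooks fit together in one pass: get_partitions sanity-checks a location list, then find_job_location picks nodes for a job. It assumes HeckleSystem and LOGGER are available as in the listing above; the node names and job values are hypothetical.

def demo_scheduling_pass(system):
    """Hypothetical driver; not part of the component itself."""
    # verify a candidate location list (raises on unknown node names)
    system.get_partitions([{'name': ['node1.mcs.anl.gov']}])
    # ask for nodes for one hypothetical job
    job = {'jobid': '1', 'nodes': 1, 'queue': 'default', 'required': [],
           'utility_score': 1.0, 'threshold': 1, 'walltime': 10, 'attrs': {}}
    locations = system.find_job_location([job], end_times=[])
    LOGGER.debug("demo: job 1 would run on %s" % locations.get('1'))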