Beispiel #1
0
 def __init__(self, *args, **kwargs):
     """Create the resource and process group containers.

     All positional and keyword arguments are forwarded unchanged to
     Component.__init__.
     """
     Component.__init__(self, *args, **kwargs)
     self.resources = ResourceDict()
     pgroups = ProcessGroupDict()
     # presumably item_cls selects the class the container instantiates
     # for new entries -- confirm in ProcessGroupDict
     pgroups.item_cls = BBProcessGroup
     self.process_groups = pgroups
     # A single "default" queue, seeded from the freshly created
     # resource container.
     self.queue_assignments = {"default": sets.Set(self.resources)}
Beispiel #2
0
 def __init__(self, *args, **kwargs):
     """Create the resource and process group containers.

     All positional and keyword arguments are forwarded unchanged to
     Component.__init__.
     """
     Component.__init__(self, *args, **kwargs)
     self.resources = ResourceDict()
     pgroups = ProcessGroupDict()
     # presumably item_cls selects the class the container instantiates
     # for new entries -- confirm in ProcessGroupDict
     pgroups.item_cls = BBProcessGroup
     self.process_groups = pgroups
     # A single "default" queue, seeded from the freshly created
     # resource container.
     self.queue_assignments = {"default": sets.Set(self.resources)}
Beispiel #3
0
class BBSystem(Component):
    """Breadboard system component.

    Methods:
    add_process_groups -- allocates nodes
    get_process_groups -- get process groups based on specs
    signal_process_groups -- signal a process group
    wait_process_groups -- removes process groups based on specs
    """

    name = "system"
    implementation = "Breadboard"

    def __init__(self, *args, **kwargs):
        """Create the resource and process group containers.

        All positional and keyword arguments are forwarded unchanged to
        Component.__init__.
        """
        Component.__init__(self, *args, **kwargs)
        self.resources = ResourceDict()
        pgroups = ProcessGroupDict()
        # presumably item_cls selects the class the container instantiates
        # for new entries -- confirm in ProcessGroupDict
        pgroups.item_cls = BBProcessGroup
        self.process_groups = pgroups
        # A single "default" queue, seeded from the freshly created
        # resource container.
        self.queue_assignments = {"default": sets.Set(self.resources)}

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """Add process groups described by specs and start each of them.

        Arguments:
        specs -- handed unchanged to the process group container's q_add

        Returns: whatever q_add returns
        """
        def start_group(pgroup, _unused):
            # q_add passes a second argument to the callback; it is ignored.
            return self._start_pg(pgroup)
        return self.process_groups.q_add(specs, start_group)
    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Return the process groups matching specs.

        self._wait() is run first, so wait() has been called on every
        process group before the lookup is performed.
        """
        self._wait()
        matching = self.process_groups.q_get(specs)
        return matching
    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Call signal(sig) on every process group matching specs.

        Arguments:
        specs -- handed unchanged to the process group container's q_get
        sig -- value passed on to each matching group's signal() method

        Returns: whatever q_get returns
        """
        def deliver(pgroup, signal_arg):
            return pgroup.signal(signal_arg)
        return self.process_groups.q_get(specs, deliver, sig)
    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Delete the process groups matching specs and release their nodes.

        Arguments:
        specs -- handed unchanged to the process group container's q_del

        Returns: whatever q_del returns
        """
        def release(pgroup, _unused):
            # q_del passes a second argument to the callback; it is ignored.
            return self._release_resources(pgroup)
        return self.process_groups.q_del(specs, release)
    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################
    def _start_pg(self, pgp):
        """Point every node of a process group at its build image and
        power-cycle it.

        Arguments:
        pgp -- process group; its location, user, kernel and building_nodes
               attributes are used

        Returns: nothing
        """

        # --- Back-compatibility with bballoc (bbtools): mirror the
        # --- allocation into /etc/bb.xml until breadboard runs entirely
        # --- on cobalt.
        legacy_cfg = bblib.BBConfig("/etc/bb.xml")
        legacy_attrs = {"user": pgp.user, "state": "Cobalt",
                        "comment": "Managed by Cobalt"}
        legacy_cfg.SetNodeAttr(pgp.location, legacy_attrs)
        legacy_cfg.WriteAndClose()
        # --- End of back-compatibility

        wanted = [{"name": nodename, "attributes": "*"}
                  for nodename in pgp.location]
        nodes = self.get_resources(wanted)
        action = "build-%s" % pgp.kernel
        for node in nodes:
            # Record the build action on the resource itself
            self.set_attributes([{"name": node.name}],
                                {"attributes": {"action": action}})
            # The per-node link is named after the MAC address, with dashes
            # instead of colons and in lower case
            mac_part = node.attributes["mac"].replace(":", "-").lower()
            linkname = "/tftpboot/pxelinux.cfg/01-%s" % mac_part
            # NOTE(review): os.readlink raises OSError when linkname is
            # missing or is not a symlink; that error propagates to the caller.
            if os.readlink(linkname) != action:
                os.unlink(linkname)
                os.symlink(action, linkname)
        for node in nodes:
            # Cycle power, then track the node as building
            os.system("/usr/sbin/pm -c %s" % node.name)
            pgp.building_nodes.append(node.name)

    def _check_builds_done(self):
        """Checks if nodes are done building for each process group and
        scripts can begin running

        For every process group that still has building or pinging nodes:
        - nodes whose "action" attribute no longer equals "build-<kernel>"
          are moved from building_nodes to pinging_nodes;
        - every pinging node that answers a single ping is dropped from
          pinging_nodes;
        - once both lists are empty, pgp.start() is called.

        Returns: nothing
        """
        # Snapshot the groups of interest before doing any work on them
        pending = [x for x in self.process_groups.itervalues() if
                   (len(x.building_nodes) > 0 or len(x.pinging_nodes) > 0)]
        for pgp in pending:
            specs = [{"name": name, "attributes": "*"}
                     for name in pgp.building_nodes]
            building = self.get_resources(specs)
            build_action = "build-%s" % pgp.kernel
            for node in building:
                if node.attributes["action"] != build_action:
                    pgp.building_nodes.remove(node.name)
                    pgp.pinging_nodes.append(node.name)
            # Iterate over a copy: removing entries from the list that is
            # being iterated makes the loop skip the element that follows
            # each removed one, leaving reachable nodes in the list.
            for nodename in list(pgp.pinging_nodes):
                # os.system returns non-zero when ping fails; keep waiting
                if os.system("/bin/ping -c 1 -W 1 %s > /dev/null"
                             % nodename):
                    continue
                pgp.pinging_nodes.remove(nodename)
            if len(pgp.building_nodes) == 0 and len(pgp.pinging_nodes) == 0:
                pgp.start()
    _check_builds_done = automatic(_check_builds_done)

    def node_done_building(self, node):
        """Sets a node as done building

        The node's "action" attribute is switched from "build-<image>" to
        "boot-<image>". Nothing happens when no resource matches the name
        or when the action does not start with "build-".

        Arguments:
        node -- string name of node that is done building

        Returns: nothing
        """
        build_prefix = "build-"
        specs = [{"name": node, "attributes": "*"}]
        nodedata = self.get_resources(specs)
        if len(nodedata) > 0:
            buildimage = nodedata[0].attributes["action"]
            # Rewrite only the leading prefix: a blanket replace() would also
            # mangle image names that themselves contain "build-".
            if buildimage.startswith(build_prefix):
                nodedata[0].attributes["action"] = \
                    "boot-" + buildimage[len(build_prefix):]
    node_done_building = exposed(node_done_building)

    def _wait(self):
        """Invoke wait() on every process group currently in the container"""
        all_groups = self.process_groups.itervalues()
        for pgroup in all_groups:
            pgroup.wait()
    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Run "pm -0" on a process group's nodes and mark them idle.

        Arguments:
        pgp -- process group whose location lists the node names to release

        Returns: nothing
        """
        nodes = pgp.location
        os.system("/usr/sbin/pm -0 %s" % " ".join(nodes))
        self.set_attributes([{"name": nodename} for nodename in nodes],
                            {"state": "idle"})

        # --- Back-compatibility with bballoc (bbtools): reset the user in
        # --- /etc/bb.xml until breadboard runs entirely on cobalt.
        legacy_cfg = bblib.BBConfig("/etc/bb.xml")
        legacy_cfg.SetNodeAttr(nodes, {"user": "******"})
        legacy_cfg.WriteAndClose()
        # --- End of back-compatibility

    ####################################
    # Methods for dealing with resources
    ####################################
    def add_resources(self, specs):
        """Add resources to this system and to the "default" queue

        Arguments:
        specs -- A list of dictionaries with the attributes for the resources

        Returns: list of values added, or the string "KeyError" when a
                 KeyError is raised while adding
        """
        try:
            added = self.resources.q_add(specs)
            for res in added:
                self.queue_assignments["default"].add(res)
        except KeyError:
            # Callers receive a marker string instead of an exception
            return "KeyError"
        return added
    add_resources = exposed(query(add_resources))

    def remove_resources(self, specs):
        """Delete resources from this system and from the "default" queue

        Arguments:
        specs -- A list of dictionaries with the attributes to pick which
                 resources to remove

        Returns: list of resources removed
        """
        removed = self.resources.q_del(specs)
        for gone in removed:
            # discard() is a no-op for members not in the queue
            self.queue_assignments["default"].discard(gone)
        return removed
    remove_resources = exposed(remove_resources)

    def get_resources(self, specs):
        """Look up the resources of this system that match specs.

        Arguments:
        specs -- list of dictionaries, handed unchanged to the resource
                 container's q_get

        Returns: whatever q_get returns
        """
        matching = self.resources.q_get(specs)
        return matching
    get_resources = exposed(query(get_resources))

    def set_attributes(self, specs, newattrs):
        """Apply a set of attribute values to the resources matching specs

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        newattrs -- a dictionary with key:val pairs of attributes to set

        Returns: whatever the resource container's q_get returns
        """
        def apply_all(res, attr_map):
            # One set_attr call per key/value pair
            return [set_attr(res, key, val)
                    for key, val in attr_map.iteritems()]
        return self.resources.q_get(specs, apply_all, newattrs)
    set_attributes = exposed(query(set_attributes))

    def remove_attributes(self, specs, attrs):
        """Strip the named attributes from the resources matching specs

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        attrs -- list of names of attributes to remove from resource.attributes

        Returns: whatever the resource container's q_get returns
        """
        def strip_all(res, names):
            # One rem_attr call per attribute name
            return [rem_attr(res, key) for key in names]
        return self.resources.q_get(specs, strip_all, attrs)
    remove_attributes = exposed(query(remove_attributes))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    def validate_job(self, spec):
        """Validate a job for submission

        Arguments:
        spec -- job specification dictionary; "nodecount" and "time" are
                read unconditionally, "kernel" and "attrs" only if present

        Returns: spec, with spec["nodecount"] converted to int

        Raises: JobValidationError when the node count is not an integer or
                is outside 1..(number of functional, scheduled resources),
                when the walltime is not a number or is below 15, when the
                build-/boot- entries for the kernel do not both exist, or
                when fewer nodes than requested match the attributes
        """
        max_nodes = len(self.get_resources([{"name": "*", "functional": True,
                                             "scheduled": True}]))
        try:
            spec["nodecount"] = int(spec["nodecount"])
        except (ValueError, TypeError):
            # TypeError covers None and other values int() cannot convert
            raise JobValidationError("Non-integer node count")
        if not 0 < spec["nodecount"] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        try:
            walltime = float(spec["time"])
        except (ValueError, TypeError):
            # Report bad input as a validation failure, not a raw exception
            raise JobValidationError("Non-numeric walltime")
        # Written as "not >=" so that a NaN walltime is rejected as well
        if not walltime >= 15:
            raise JobValidationError("Walltime less than minimum 15 minutes")
        if "kernel" in spec:
            # Both the build- and the boot- entry must exist for the image
            build_entry = "/tftpboot/pxelinux.cfg/build-%s" % spec["kernel"]
            boot_entry = "/tftpboot/pxelinux.cfg/boot-%s" % spec["kernel"]
            if not (os.path.exists(build_entry) and
                    os.path.exists(boot_entry)):
                raise JobValidationError(("Specified image %s (from -k " +
                                          "'kernel' flag) does not exist")
                                         % spec["kernel"])
        if "attrs" in spec:
            matched_res = self.resources.get_attr_matched_resources(
                [{"name": "*", "functional": True, "scheduled": True,
                  "attributes": "*"}],
                spec["attrs"])
            if spec["nodecount"] > len(matched_res):
                raise JobValidationError("Not enough nodes exist with the " +
                                         "attributes to match")
        return spec
    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """Return the names from location_list that match a known resource.

        Arguments:
        location_list -- iterable of resource names to look up

        Returns: list with the name of every resource found
        """
        specs = [{"name": wanted} for wanted in location_list]
        found = self.get_resources(specs)
        return [res.name for res in found]
    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """Pick idle nodes for each job and mark the chosen nodes busy

        Arguments:
        job_location_args -- A list of dictionaries with info about the job;
            the keys read here are:
            jobid -- key used in the returned dictionary
            nodes -- int number of nodes
            utility_score -- sort key for the job list
            attrs -- dictionary of attributes to match against (a missing
                     or None value is replaced by {} in the job dictionary)
        end_times -- not used by this method

        Jobs are handled in ascending utility_score order and the list is
        sorted in place.
        NOTE(review): ascending order serves the lowest score first --
        confirm that this is the intended priority.

        Returns: Dictionary with list of nodes a job can run on, keyed by
                 jobid; jobs that cannot be placed have no entry
        """
        def by_utility(job):
            """Sort key: the job's utility score"""
            return job["utility_score"]

        def by_name(res):
            """Sort key: the resource's name"""
            return res.name

        job_location_args.sort(key=by_utility)
        locations = {}
        for job in job_location_args:
            if job.get("attrs") is None:
                job["attrs"] = {}
            idle_specs = [{"name": "*", "functional": True, "scheduled": True,
                           "state": "idle", "attributes": "*"}]
            candidates = self.resources.get_attr_matched_resources(
                idle_specs, job["attrs"])
            if len(candidates) >= job["nodes"]:
                # Take the first nodes in name order and reserve them
                candidates.sort(key=by_name)
                chosen = candidates[:job["nodes"]]
                for res in chosen:
                    res.state = "busy"
                locations[job["jobid"]] = [res.name for res in chosen]
            # otherwise: can't schedule job - not enough resources
        return locations
    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """Group active queues that share resources into classes

        Arguments:
        reservation_dict -- maps a reservation name to a ":"-separated
                            string of host names
        active_queue_names -- container of queue names; queues not in it
                              are skipped

        Returns: list of dictionaries, each with a 'queues' list and a
                 'reservations' list
        """
        # Pass 1: a queue joins the first class whose members overlap with
        # its own, otherwise it opens a new class.
        classes = []
        for queue in self.queue_assignments:
            if queue not in active_queue_names:
                # skip queues that aren't running
                continue
            members = self.queue_assignments[queue]
            for candidate in classes:
                if candidate['data'].intersection(members):
                    candidate['queues'].add(queue)
                    candidate['data'].update(members)
                    break
            else:
                classes.append({'queues': set([queue]),
                                'data': set(members),
                                'reservations': set()})

        # Pass 2: fold together classes that have a queue in common.
        merged = []
        for eq_class in classes:
            for target in merged:
                if target['queues'].intersection(eq_class['queues']):
                    target['queues'].update(eq_class['queues'])
                    target['data'].update(eq_class['data'])
                    break
            else:
                merged.append(eq_class)

        # Pass 3: attach reservations whose hosts appear in a class, then
        # drop the working 'data' set and turn the remaining sets into lists.
        for eq_class in merged:
            members = eq_class['data']
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in members:
                        eq_class['reservations'].add(res_name)
            del eq_class['data']
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
        return merged
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
Beispiel #4
0
class BBSystem(Component):
    """Breadboard system component.

    Methods:
    add_process_groups -- allocates nodes
    get_process_groups -- get process groups based on specs
    signal_process_groups -- signal a process group
    wait_process_groups -- removes process groups based on specs
    """

    name = "system"
    implementation = "Breadboard"

    def __init__(self, *args, **kwargs):
        """Create the resource and process group containers.

        All positional and keyword arguments are forwarded unchanged to
        Component.__init__.
        """
        Component.__init__(self, *args, **kwargs)
        self.resources = ResourceDict()
        pgroups = ProcessGroupDict()
        # presumably item_cls selects the class the container instantiates
        # for new entries -- confirm in ProcessGroupDict
        pgroups.item_cls = BBProcessGroup
        self.process_groups = pgroups
        # A single "default" queue, seeded from the freshly created
        # resource container.
        self.queue_assignments = {"default": sets.Set(self.resources)}

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """Add process groups described by specs and start each of them.

        Arguments:
        specs -- handed unchanged to the process group container's q_add

        Returns: whatever q_add returns
        """
        def start_group(pgroup, _unused):
            # q_add passes a second argument to the callback; it is ignored.
            return self._start_pg(pgroup)
        return self.process_groups.q_add(specs, start_group)

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Return the process groups matching specs.

        self._wait() is run first, so wait() has been called on every
        process group before the lookup is performed.
        """
        self._wait()
        matching = self.process_groups.q_get(specs)
        return matching

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Call signal(sig) on every process group matching specs.

        Arguments:
        specs -- handed unchanged to the process group container's q_get
        sig -- value passed on to each matching group's signal() method

        Returns: whatever q_get returns
        """
        def deliver(pgroup, signal_arg):
            return pgroup.signal(signal_arg)
        return self.process_groups.q_get(specs, deliver, sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Delete the process groups matching specs and release their nodes.

        Arguments:
        specs -- handed unchanged to the process group container's q_del

        Returns: whatever q_del returns
        """
        def release(pgroup, _unused):
            # q_del passes a second argument to the callback; it is ignored.
            return self._release_resources(pgroup)
        return self.process_groups.q_del(specs, release)

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################
    def _start_pg(self, pgp):
        """Point every node of a process group at its build image and
        power-cycle it.

        Arguments:
        pgp -- process group; its location, user, kernel and building_nodes
               attributes are used

        Returns: nothing
        """

        # --- Back-compatibility with bballoc (bbtools): mirror the
        # --- allocation into /etc/bb.xml until breadboard runs entirely
        # --- on cobalt.
        legacy_cfg = bblib.BBConfig("/etc/bb.xml")
        legacy_attrs = {"user": pgp.user, "state": "Cobalt",
                        "comment": "Managed by Cobalt"}
        legacy_cfg.SetNodeAttr(pgp.location, legacy_attrs)
        legacy_cfg.WriteAndClose()
        # --- End of back-compatibility

        wanted = [{"name": nodename, "attributes": "*"}
                  for nodename in pgp.location]
        nodes = self.get_resources(wanted)
        action = "build-%s" % pgp.kernel
        for node in nodes:
            # Record the build action on the resource itself
            self.set_attributes([{"name": node.name}],
                                {"attributes": {"action": action}})
            # The per-node link is named after the MAC address, with dashes
            # instead of colons and in lower case
            mac_part = node.attributes["mac"].replace(":", "-").lower()
            linkname = "/tftpboot/pxelinux.cfg/01-%s" % mac_part
            # NOTE(review): os.readlink raises OSError when linkname is
            # missing or is not a symlink; that error propagates to the caller.
            if os.readlink(linkname) != action:
                os.unlink(linkname)
                os.symlink(action, linkname)
        for node in nodes:
            # Cycle power, then track the node as building
            os.system("/usr/sbin/pm -c %s" % node.name)
            pgp.building_nodes.append(node.name)

    def _check_builds_done(self):
        """Checks if nodes are done building for each process group and
        scripts can begin running

        For every process group that still has building or pinging nodes:
        - nodes whose "action" attribute no longer equals "build-<kernel>"
          are moved from building_nodes to pinging_nodes;
        - every pinging node that answers a single ping is dropped from
          pinging_nodes;
        - once both lists are empty, pgp.start() is called.

        Returns: nothing
        """
        # Snapshot the groups of interest before doing any work on them
        pending = [
            x for x in self.process_groups.itervalues()
            if (len(x.building_nodes) > 0 or len(x.pinging_nodes) > 0)
        ]
        for pgp in pending:
            specs = [{
                "name": name,
                "attributes": "*"
            } for name in pgp.building_nodes]
            building = self.get_resources(specs)
            build_action = "build-%s" % pgp.kernel
            for node in building:
                if node.attributes["action"] != build_action:
                    pgp.building_nodes.remove(node.name)
                    pgp.pinging_nodes.append(node.name)
            # Iterate over a copy: removing entries from the list that is
            # being iterated makes the loop skip the element that follows
            # each removed one, leaving reachable nodes in the list.
            for nodename in list(pgp.pinging_nodes):
                # os.system returns non-zero when ping fails; keep waiting
                if os.system("/bin/ping -c 1 -W 1 %s > /dev/null" % nodename):
                    continue
                pgp.pinging_nodes.remove(nodename)
            if len(pgp.building_nodes) == 0 and len(pgp.pinging_nodes) == 0:
                pgp.start()

    _check_builds_done = automatic(_check_builds_done)

    def node_done_building(self, node):
        """Sets a node as done building

        The node's "action" attribute is switched from "build-<image>" to
        "boot-<image>". Nothing happens when no resource matches the name
        or when the action does not start with "build-".

        Arguments:
        node -- string name of node that is done building

        Returns: nothing
        """
        build_prefix = "build-"
        specs = [{"name": node, "attributes": "*"}]
        nodedata = self.get_resources(specs)
        if len(nodedata) > 0:
            buildimage = nodedata[0].attributes["action"]
            # Rewrite only the leading prefix: a blanket replace() would also
            # mangle image names that themselves contain "build-".
            if buildimage.startswith(build_prefix):
                nodedata[0].attributes["action"] = \
                    "boot-" + buildimage[len(build_prefix):]

    node_done_building = exposed(node_done_building)

    def _wait(self):
        """Invoke wait() on every process group currently in the container"""
        all_groups = self.process_groups.itervalues()
        for pgroup in all_groups:
            pgroup.wait()

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Run "pm -0" on a process group's nodes and mark them idle.

        Arguments:
        pgp -- process group whose location lists the node names to release

        Returns: nothing
        """
        nodes = pgp.location
        os.system("/usr/sbin/pm -0 %s" % " ".join(nodes))
        self.set_attributes([{"name": nodename} for nodename in nodes],
                            {"state": "idle"})

        # --- Back-compatibility with bballoc (bbtools): reset the user in
        # --- /etc/bb.xml until breadboard runs entirely on cobalt.
        legacy_cfg = bblib.BBConfig("/etc/bb.xml")
        legacy_cfg.SetNodeAttr(nodes, {"user": "******"})
        legacy_cfg.WriteAndClose()
        # --- End of back-compatibility

    ####################################
    # Methods for dealing with resources
    ####################################
    def add_resources(self, specs):
        """Add resources to this system and to the "default" queue

        Arguments:
        specs -- A list of dictionaries with the attributes for the resources

        Returns: list of values added, or the string "KeyError" when a
                 KeyError is raised while adding
        """
        try:
            added = self.resources.q_add(specs)
            for res in added:
                self.queue_assignments["default"].add(res)
        except KeyError:
            # Callers receive a marker string instead of an exception
            return "KeyError"
        return added

    add_resources = exposed(query(add_resources))

    def remove_resources(self, specs):
        """Delete resources from this system and from the "default" queue

        Arguments:
        specs -- A list of dictionaries with the attributes to pick which
                 resources to remove

        Returns: list of resources removed
        """
        removed = self.resources.q_del(specs)
        for gone in removed:
            # discard() is a no-op for members not in the queue
            self.queue_assignments["default"].discard(gone)
        return removed

    remove_resources = exposed(remove_resources)

    def get_resources(self, specs):
        """Look up the resources of this system that match specs.

        Arguments:
        specs -- list of dictionaries, handed unchanged to the resource
                 container's q_get

        Returns: whatever q_get returns
        """
        matching = self.resources.q_get(specs)
        return matching

    get_resources = exposed(query(get_resources))

    def set_attributes(self, specs, newattrs):
        """Apply a set of attribute values to the resources matching specs

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        newattrs -- a dictionary with key:val pairs of attributes to set

        Returns: whatever the resource container's q_get returns
        """
        def apply_all(res, attr_map):
            # One set_attr call per key/value pair
            return [set_attr(res, key, val)
                    for key, val in attr_map.iteritems()]
        return self.resources.q_get(specs, apply_all, newattrs)

    set_attributes = exposed(query(set_attributes))

    def remove_attributes(self, specs, attrs):
        """Strip the named attributes from the resources matching specs

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        attrs -- list of names of attributes to remove from resource.attributes

        Returns: whatever the resource container's q_get returns
        """
        def strip_all(res, names):
            # One rem_attr call per attribute name
            return [rem_attr(res, key) for key in names]
        return self.resources.q_get(specs, strip_all, attrs)

    remove_attributes = exposed(query(remove_attributes))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    def validate_job(self, spec):
        """Validate a job for submission

        Arguments:
        spec -- job specification dictionary; "nodecount" and "time" are
                read unconditionally, "kernel" and "attrs" only if present

        Returns: spec, with spec["nodecount"] converted to int

        Raises: JobValidationError when the node count is not an integer or
                is outside 1..(number of functional, scheduled resources),
                when the walltime is not a number or is below 15, when the
                build-/boot- entries for the kernel do not both exist, or
                when fewer nodes than requested match the attributes
        """
        max_nodes = len(
            self.get_resources([{
                "name": "*",
                "functional": True,
                "scheduled": True
            }]))
        try:
            spec["nodecount"] = int(spec["nodecount"])
        except (ValueError, TypeError):
            # TypeError covers None and other values int() cannot convert
            raise JobValidationError("Non-integer node count")
        if not 0 < spec["nodecount"] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        try:
            walltime = float(spec["time"])
        except (ValueError, TypeError):
            # Report bad input as a validation failure, not a raw exception
            raise JobValidationError("Non-numeric walltime")
        # Written as "not >=" so that a NaN walltime is rejected as well
        if not walltime >= 15:
            raise JobValidationError("Walltime less than minimum 15 minutes")
        if "kernel" in spec:
            # Both the build- and the boot- entry must exist for the image
            build_entry = "/tftpboot/pxelinux.cfg/build-%s" % spec["kernel"]
            boot_entry = "/tftpboot/pxelinux.cfg/boot-%s" % spec["kernel"]
            if not (os.path.exists(build_entry)
                    and os.path.exists(boot_entry)):
                raise JobValidationError(
                    ("Specified image %s (from -k " +
                     "'kernel' flag) does not exist") % spec["kernel"])
        if "attrs" in spec:
            matched_res = self.resources.get_attr_matched_resources(
                [{
                    "name": "*",
                    "functional": True,
                    "scheduled": True,
                    "attributes": "*"
                }], spec["attrs"])
            if spec["nodecount"] > len(matched_res):
                raise JobValidationError("Not enough nodes exist with the " +
                                         "attributes to match")
        return spec

    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """Return the names from location_list that match a known resource.

        Arguments:
        location_list -- iterable of resource names to look up

        Returns: list with the name of every resource found
        """
        specs = [{"name": wanted} for wanted in location_list]
        found = self.get_resources(specs)
        return [res.name for res in found]

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """Pick idle nodes for each job and mark the chosen nodes busy

        Arguments:
        job_location_args -- A list of dictionaries with info about the job;
            the keys read here are:
            jobid -- key used in the returned dictionary
            nodes -- int number of nodes
            utility_score -- sort key for the job list
            attrs -- dictionary of attributes to match against (a missing
                     or None value is replaced by {} in the job dictionary)
        end_times -- not used by this method

        Jobs are handled in ascending utility_score order and the list is
        sorted in place.
        NOTE(review): ascending order serves the lowest score first --
        confirm that this is the intended priority.

        Returns: Dictionary with list of nodes a job can run on, keyed by
                 jobid; jobs that cannot be placed have no entry
        """
        def by_utility(job):
            """Sort key: the job's utility score"""
            return job["utility_score"]

        def by_name(res):
            """Sort key: the resource's name"""
            return res.name

        job_location_args.sort(key=by_utility)
        locations = {}
        for job in job_location_args:
            if job.get("attrs") is None:
                job["attrs"] = {}
            idle_specs = [{
                "name": "*",
                "functional": True,
                "scheduled": True,
                "state": "idle",
                "attributes": "*"
            }]
            candidates = self.resources.get_attr_matched_resources(
                idle_specs, job["attrs"])
            if len(candidates) >= job["nodes"]:
                # Take the first nodes in name order and reserve them
                candidates.sort(key=by_name)
                chosen = candidates[:job["nodes"]]
                for res in chosen:
                    res.state = "busy"
                locations[job["jobid"]] = [res.name for res in chosen]
            # otherwise: can't schedule job - not enough resources
        return locations

    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """Group active queues that share resources into classes

        Arguments:
        reservation_dict -- maps a reservation name to a ":"-separated
                            string of host names
        active_queue_names -- container of queue names; queues not in it
                              are skipped

        Returns: list of dictionaries, each with a 'queues' list and a
                 'reservations' list
        """
        # Pass 1: a queue joins the first class whose members overlap with
        # its own, otherwise it opens a new class.
        classes = []
        for queue in self.queue_assignments:
            if queue not in active_queue_names:
                # skip queues that aren't running
                continue
            members = self.queue_assignments[queue]
            for candidate in classes:
                if candidate['data'].intersection(members):
                    candidate['queues'].add(queue)
                    candidate['data'].update(members)
                    break
            else:
                classes.append({
                    'queues': set([queue]),
                    'data': set(members),
                    'reservations': set()
                })

        # Pass 2: fold together classes that have a queue in common.
        merged = []
        for eq_class in classes:
            for target in merged:
                if target['queues'].intersection(eq_class['queues']):
                    target['queues'].update(eq_class['queues'])
                    target['data'].update(eq_class['data'])
                    break
            else:
                merged.append(eq_class)

        # Pass 3: attach reservations whose hosts appear in a class, then
        # drop the working 'data' set and turn the remaining sets into lists.
        for eq_class in merged:
            members = eq_class['data']
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in members:
                        eq_class['reservations'].add(res_name)
            del eq_class['data']
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
        return merged

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)