Ejemplo n.º 1
0
    def fetch_nodearray_definitions(self):
        '''
            Fetch the nodearray definitions through machine.fetch_nodearray_definitions (the autoscale
            library wrapper that parses the Configuration.autoscale.* chef attributes) and post-process
            every machine type:

              * each attribute value is passed through pbscc.parse_gb_size, so user supplied sizes
                such as disk = 100G get parsed; values that are not size expressions are left as-is.
              * the 'ungrouped' attribute is set to the string "true" or "false".

            Returns the (mutated in place) definitions object with placement_group_optional set to True.

            See cyclecloud.nodearrays.NodearrayDefinitions for more info.
        '''
        definitions = machine.fetch_nodearray_definitions(self.clusters_api,
                                                          self.default_placement_attrs)
        definitions.placement_group_optional = True

        for mtype in definitions:
            # Only values of existing keys are replaced, so the dict does not change size
            # while it is being iterated.
            for attr_name, raw_value in mtype.iteritems():
                try:
                    mtype[attr_name] = pbscc.parse_gb_size(attr_name, raw_value)
                except InvalidSizeExpressionError:
                    # not a size expression - keep the original value
                    pass

            # kludge: there is a strange bug where ungrouped is showing up as a string and not a
            # boolean, so the flag is deliberately written as a string here.
            if mtype.get("group_id"):
                mtype["ungrouped"] = "false"
                # NOTE(review): this replaces the existing group_id with a freshly generated
                # "ungrouped-" id on the *grouped* branch - confirm this is intended.
                mtype["group_id"] = str(autoscale_util.uuid("ungrouped-"))
            else:
                mtype["ungrouped"] = "true"

        return definitions
    def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
        '''
            Either schedule the given pbsnode for shutdown or convert it into a machine instance.

            pbsnode: mapping describing the node. "state" (comma separated) and "resources_available"
                are always read; "jobs" and "last_state_change_time" are read when the node is
                offline / down. "resources_available" is modified in place (size expressions are
                parsed and "hostname" is added).
            instance_ids_to_shutdown: OUT parameter, a mapping of instance id -> hostname that is
                filled in when the node can be shut down.
            nodearray_definitions: used to look up the machinetype of the node via get_machinetype.

            Returns None when the node was scheduled for shutdown or is in any "down" state, otherwise
            the result of machine.new_machine_instance for this node.
        '''

        states = set(pbsnode["state"].split(","))
        resources = pbsnode["resources_available"]
        # host has incorrect case, so the vnode resource is used as the hostname
        hostname = resources["vnode"]

        # NOTE(review): the default is evaluated eagerly, and a generated id is used whenever the
        # node reports no instance_id - presumably the "not instance_id" guard below then only
        # fires for an empty reported value; confirm.
        instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))

        def attempt_shutdown():
            # Records the node in instance_ids_to_shutdown and returns True if it may be shut
            # down right now, otherwise returns False.
            if not instance_id:
                pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
                return False

            if "down" not in states:
                instance_ids_to_shutdown[instance_id] = hostname
                return True

            # don't immediately remove down nodes, give them time to recover from network failure.
            grace_period = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
            elapsed = self.clock.time() - pbsnode["last_state_change_time"]
            if elapsed > grace_period:
                pbscc.error("Removing down node %s after %.0f seconds", hostname, elapsed)
                instance_ids_to_shutdown[instance_id] = hostname
                return True

            remaining = grace_period - elapsed
            pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, remaining)
            return False

        if "offline" in states:
            if pbsnode.get("jobs", []):
                pbscc.fine("Host %s is offline but still running jobs" % hostname)
            else:
                pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
                if attempt_shutdown():
                    return

        # if the node is just in the down state, try to shut it down.
        if states == set(["down"]) and attempt_shutdown():
            return

        # just ignore complex down nodes (down,job-busy etc) until PBS decides to change the state.
        if "down" in states:
            return

        # convert relevant resources from bytes to floating point (GB)
        for res_name in resources:
            current = resources[res_name]

            # boolean-looking strings ("true"/"false", any case) are left untouched
            if isinstance(current, basestring) and current.lower() in ("true", "false"):
                continue

            # TODO will need to support list values eventually
            if isinstance(current, list):
                continue

            try:
                resources[res_name] = pbscc.parse_gb_size(res_name, current)
            except InvalidSizeExpressionError:
                # not a size expression - keep the original value
                pass

        resources["hostname"] = hostname

        nodearray_name = resources.get("nodearray") or resources.get("slot_type")
        group_id = resources.get("group_id")
        machinetype_name = resources.get("machinetype")

        if machinetype_name and nodearray_name:
            machinetype = nodearray_definitions.get_machinetype(nodearray_name, machinetype_name, group_id)
        else:
            # rely solely on resources_available
            pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
            machinetype = {"availableCount": 1, "name": "undefined"}

        return machine.new_machine_instance(machinetype, **pbsnode["resources_available"])