Beispiel #1
0
    def get_existing_machines(self, nodearray_definitions):
        '''
            Queries pbsnodes and CycleCloud to get a sane set of cyclecloud.machine.Machine instances that represent the current state of the cluster.

            nodearray_definitions is handed to autoscale_util.nodes_by_instance_id and process_pbsnode, and is used
            here to look up the machinetype of every CycleCloud node that has no matching pbsnode.

            Returns a 4-tuple:
                pbsnodes                 - the mapping returned by self.driver.pbsnodes().get(None)
                existing_machines        - list with one machine instance per pbsnode that process_pbsnode converted,
                                           followed by one per CycleCloud node that no pbsnode matched
                nodes_by_instance_id     - Record of instance id -> pbsnode, or -> CycleCloud node when no pbsnode matched
                instance_ids_to_shutdown - Record populated by process_pbsnode (OUT parameter there)

            Raises ValueError when get_machinetype raises KeyError for an unmatched CycleCloud node.
        '''
        pbsnodes = self.driver.pbsnodes().get(None)
        existing_machines = []

        # Nodes as CycleCloud sees them, keyed by instance id. Entries are removed below as soon as a pbsnode
        # with the same instance id is found, so whatever is left has not shown up in pbsnodes yet.
        booting_instance_ids = autoscale_util.nodes_by_instance_id(
            self.clusters_api, nodearray_definitions)

        instance_ids_to_shutdown = Record()

        nodes_by_instance_id = Record()

        for pbsnode in pbsnodes.values():
            inst = self.process_pbsnode(pbsnode, instance_ids_to_shutdown,
                                        nodearray_definitions)
            # process_pbsnode returns nothing for nodes that should not be counted as existing machines
            if not inst:
                continue
            existing_machines.append(inst)

            # we found the pbsnode that matches the cyclecloud node, so let's remove the duplicate
            instance_id = inst.get_attr("instance_id", "")
            if instance_id in booting_instance_ids:
                booting_instance_ids.pop(instance_id)
            nodes_by_instance_id[instance_id] = pbsnode

        # items() instead of the Python-2-only iteritems(): the list() snapshot is identical on Python 2
        # and the loop keeps working on Python 3.
        for instance_id, node in list(booting_instance_ids.items()):
            nodearray_name = node["Template"]
            machinetype_name = node["MachineType"]

            try:
                machinetype = nodearray_definitions.get_machinetype(
                    nodearray_name, machinetype_name,
                    node.get("PlacementGroupId"))
            except KeyError as e:
                raise ValueError(
                    "machine is %s, key is %s, rest is %s" %
                    (nodearray_name, str(e), nodearray_definitions))

            # NOTE(review): "placementGroupId" here differs in case from "PlacementGroupId" above; this is
            # only equivalent if the node mapping is case-insensitive - confirm.
            inst = machine.new_machine_instance(
                machinetype,
                hostname=node.get("hostname"),
                instance_id=node.get("InstanceId"),
                group_id=node.get("placementGroupId"))
            existing_machines.append(inst)
            nodes_by_instance_id[instance_id] = node

        return pbsnodes, existing_machines, nodes_by_instance_id, instance_ids_to_shutdown
Beispiel #2
0
 def get_existing_machines(self, nodearray_definitions):
     '''
         Build the list of cyclecloud.machine.Machine instances describing the cluster right now, combining what
         pbsnodes reports with the nodes CycleCloud knows about.

         For every pbsnode whose instance id is known to CycleCloud, keep_alive is refreshed from the CycleCloud
         node, and machinetype / nodearray are filled in from it when the pbsnode has no value of its own.

         Returns a 4-tuple: (pbsnodes, list of machine instances, Record of instance id -> pbsnode or CycleCloud
         node, Record of instance ids to shut down as filled in by process_pbsnode).

         Raises ValueError when get_machinetype raises KeyError for a CycleCloud node without a matching pbsnode.
     '''
     pbsnodes = self.driver.pbsnodes().get(None)
     cc_nodes = autoscale_util.nodes_by_instance_id(self.clusters_api, nodearray_definitions)

     machines = []
     to_shutdown = Record()
     by_instance_id = Record()

     for pbsnode in pbsnodes.values():
         available = pbsnode["resources_available"]
         reported_id = available.get("instance_id", "")
         cc_node = cc_nodes.get(reported_id) if reported_id else None

         if cc_node:
             # these can change at runtime (keepalive) or may never have been set by
             # older versions (machinetype/nodearray), so sync them from CycleCloud now
             available["keep_alive"] = cc_node.get("KeepAlive", False)
             available["machinetype"] = available.get("machinetype") or cc_node.get("MachineType")
             available["nodearray"] = available.get("nodearray") or cc_node.get("Template")

         converted = self.process_pbsnode(pbsnode, to_shutdown, nodearray_definitions)
         if converted:
             machines.append(converted)

             # this pbsnode accounts for the CycleCloud node with the same id, so drop
             # the CycleCloud entry to avoid counting the machine twice
             machine_id = converted.get_attr("instance_id", "")
             if machine_id in cc_nodes:
                 cc_nodes.pop(machine_id)
             by_instance_id[machine_id] = pbsnode

     # whatever is still in cc_nodes had no matching pbsnode
     for instance_id, node in list(cc_nodes.items()):
         nodearray_name = node["Template"]
         machinetype_name = node["MachineType"]

         try:
             machinetype = nodearray_definitions.get_machinetype(nodearray_name,
                                                                 machinetype_name,
                                                                 node.get("PlacementGroupId"))
         except KeyError as e:
             raise ValueError("machine is %s, key is %s, rest is %s" % (nodearray_name, str(e), nodearray_definitions))

         attributes = {
             "hostname": node.get("hostname"),
             "instance_id": node.get("InstanceId"),
             "group_id": node.get("placementGroupId"),
             "keep_alive": node.get("KeepAlive", False),
         }
         machines.append(machine.new_machine_instance(machinetype, **attributes))
         by_instance_id[instance_id] = node

     return pbsnodes, machines, by_instance_id, to_shutdown
    def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
        '''
            If the pbsnode is offline, will handle evaluating whether the node can be shutdown. See instance_ids_to_shutdown, which
            is an OUT parameter here.
            
            Otherwise convert the pbsnode into a cyclecloud.machine.Machine instance.

            pbsnode must provide "state" (comma separated string) and "resources_available" (mapping containing
            at least "vnode"); "jobs" is optional and "last_state_change_time" is read for down nodes.
            NOTE: pbsnode["resources_available"] is modified in place (booleans and sizes are parsed, "hostname" is added).

            instance_ids_to_shutdown receives instance id -> hostname for every node selected for shutdown.

            Returns None when the node was selected for shutdown or has "down" among its states, otherwise the
            instance created by machine.new_machine_instance.
        '''
        
        states = set(pbsnode["state"].split(","))
        resources = pbsnode["resources_available"]
        # host has incorrect case
        hostname = resources["vnode"]
        
        instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))
        
        def try_shutdown_pbsnode():
            # Records the node in instance_ids_to_shutdown when it may be shut down; returns True only in that case.
            if not instance_id:
                pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
            elif "down" in states:
                # don't immediately remove down nodes, give them time to recover from network failure.
                remove_down_nodes = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
                since_down = self.clock.time() - pbsnode["last_state_change_time"]
                if since_down > remove_down_nodes:
                    # messages are pre-formatted like every other pbscc call here, so they do not depend on
                    # the logging helpers accepting extra positional arguments
                    pbscc.error("Removing down node %s after %.0f seconds" % (hostname, since_down))
                    instance_ids_to_shutdown[instance_id] = hostname
                    return True
                else:
                    omega = remove_down_nodes - since_down
                    pbscc.warn("Not removing down node %s for another %.0f seconds" % (hostname, omega))
            else:
                instance_ids_to_shutdown[instance_id] = hostname
                return True
            
            return False
        
        if "offline" in states:
            if not pbsnode.get("jobs", []):
                pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
                if try_shutdown_pbsnode():
                    return
            else:
                pbscc.fine("Host %s is offline but still running jobs" % hostname)
        
        # if the node is just in the down state, try to shut it down. 
        if states == {"down"} and try_shutdown_pbsnode():
            return
        
        # just ignore complex down nodes (down,job-busy etc) until PBS decides to change the state.
        if "down" in states:
            return
        
        # normalize resource values: "true"/"false" strings become booleans, size expressions become
        # floating point (GB). Only values are replaced, so iterating the keys while assigning is safe.
        for key in resources:
            
            value = resources[key]
            if isinstance(value, basestring) and value.lower() in ["true", "false"]:
                # store the parsed boolean; previously it was computed and then discarded
                resources[key] = value.lower() == "true"
            elif isinstance(value, list):
                # TODO will need to support this eventually
                continue
            else:
                try:
                    resources[key] = pbscc.parse_gb_size(key, value)
                except InvalidSizeExpressionError:
                    # not a size expression, keep the value untouched
                    pass
        
        resources["hostname"] = hostname
        
        nodearray_name = resources.get("nodearray") or resources.get("slot_type")
        group_id = resources.get("group_id")

        if resources.get("machinetype") and nodearray_name:
            machinetype = nodearray_definitions.get_machinetype(nodearray_name, resources.get("machinetype"), group_id)
        else:
            # rely solely on resources_available
            pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
            machinetype = {"availableCount": 1, "name": "undefined"}
            
        inst = machine.new_machine_instance(machinetype, **pbsnode["resources_available"])

        return inst