コード例 #1
0
    def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
        '''
            If the pbsnode is offline, will handle evaluating whether the node can be shutdown. See instance_ids_to_shutdown, which
            is an OUT parameter here.
            
            Otherwise convert the pbsnode into a cyclecloud.machine.Machine instance.
        '''
        
        states = set(pbsnode["state"].split(","))
        resources = pbsnode["resources_available"]
        # host has incorrect case
        hostname = resources["vnode"]
        
        instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))
        
        def try_shutdown_pbsnode():
            if not instance_id:
                pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
            elif "down" in states:
                # don't immediately remove down nodes, give them time to recover from network failure.
                remove_down_nodes = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
                since_down = self.clock.time() - pbsnode["last_state_change_time"]
                if since_down > remove_down_nodes:
                    pbscc.error("Removing down node %s after %.0f seconds", hostname, since_down)
                    instance_ids_to_shutdown[instance_id] = hostname
                    return True
                else:
                    omega = remove_down_nodes - since_down
                    pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, omega)
            else:
                instance_ids_to_shutdown[instance_id] = hostname
                return True
            
            return False
        
        if "offline" in states:
            if not pbsnode.get("jobs", []):
                pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
                if try_shutdown_pbsnode():
                    return
            else:
                pbscc.fine("Host %s is offline but still running jobs" % hostname)
        
        # if the node is just in the down state, try to shut it down. 
        if set(["down"]) == states and try_shutdown_pbsnode():
            return
        
        # just ignore complex down nodes (down,job-busy etc) until PBS decides to change the state.
        if "down" in states:
            return
        
        # convert relevant resources from bytes to floating point (GB)
        for key in resources:
            
            value = resources[key]
            if isinstance(value, basestring) and value.lower() in ["true", "false"]:
                value = value.lower() == "true"
            elif isinstance(value, list):
                # TODO will need to support this eventually
                continue
            else:
                try:
                    resources[key] = pbscc.parse_gb_size(key, resources[key])
                except InvalidSizeExpressionError:
                    pass
        
        resources["hostname"] = hostname
        
        nodearray_name = resources.get("nodearray") or resources.get("slot_type")
        group_id = resources.get("group_id")

        if resources.get("machinetype") and nodearray_name:
            machinetype = nodearray_definitions.get_machinetype(nodearray_name, resources.get("machinetype"), group_id)
        else:
            # rely solely on resources_available
            pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
            machinetype = {"availableCount": 1, "name": "undefined"}
            
        inst = machine.new_machine_instance(machinetype, **pbsnode["resources_available"])

        return inst
コード例 #2
0
    def autoscale(self):
        '''
            The main loop described at the top of this class. 
            Returns machine_requests, idle_machines and total_machines for ease of unit testing.
        '''
        pbscc.info("Begin autoscale cycle")
        
        nodearray_definitions = self.fetch_nodearray_definitions()
        
        pbsnodes_by_hostname, existing_machines, booting_instance_ids, instance_ids_to_shutdown = self.get_existing_machines(nodearray_definitions)
        
        start_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.start_enabled", "true")).lower()
        
        if not start_enabled:
            pbscc.warn("cyclecloud.cluster.autoscale.start_enabled is false, new machines will not be allocated.")
        
        autoscaler = autoscalerlib.Autoscaler(nodearray_definitions, existing_machines, self.default_placement_attrs, start_enabled)
        
        # throttle how many jobs we attempt to match. When pbspro.compress_jobs is true (default) this shouldn't really be an issue
        # unless the user has over $pbspro.max_unmatched_jobs unique sets of requirements.
        max_unmatched_jobs = int(self.cc_config.get("pbspro.max_unmatched_jobs", 10000))
        unmatched_jobs = 0
        
        for job in self.query_jobs():
            if job.executing_hostname:
                try:
                    autoscaler.get_machine(hostname=job.executing_hostname).add_job(job, force=True)
                    continue
                except RuntimeError as e:
                    pbscc.error(str(e))
                    pass
                    
            if not autoscaler.add_job(job):
                unmatched_jobs += 1
                pbscc.info("Can not match job %s." % job.name)
                if max_unmatched_jobs > 0 and unmatched_jobs >= max_unmatched_jobs:
                    pbscc.warn('Maximum number of unmatched jobs reached - %s. To configure this setting, change {"pbspro": "max_unmatched_jobs": N}} in %s' % (unmatched_jobs, pbscc.CONFIG_PATH))
                    break
        
        machine_requests = autoscaler.get_new_machine_requests()
        idle_machines = autoscaler.get_idle_machines()
        
        autoscale_request = autoscale_util.create_autoscale_request(machine_requests)
        for request_set in autoscale_request["sets"]:
            configuration = request_set["nodeAttributes"]["Configuration"]
            
            if "pbspro" not in configuration:
                    configuration["pbspro"] = {}
            
            configuration["pbspro"]["slot_type"] = request_set["nodearray"]
            if not request_set.get("placementGroupId"):
                configuration["pbspro"]["is_grouped"] = False
            else:
                configuration["pbspro"]["is_grouped"] = True
                
        autoscale_util.scale_up(self.clusters_api, autoscale_request)
        
        for r in machine_requests:
            if r.placeby_value:
                pbscc.info("Requesting %d %s machines in placement group %s for nodearray %s" % (r.instancecount, r.machinetype, r.placeby_value, r.nodearray))
            else:
                pbscc.info("Requesting %d %s machines in nodearray %s" % (r.instancecount, r.machinetype, r.nodearray))
        
        if pbscc.is_fine():
            pbscc.fine("New target state of the cluster, including booting nodes:")
            
            for m in autoscaler.machines:
                pbscc.fine("    %s" % str(m))
        
        if instance_ids_to_shutdown:
            pbscc.info("Shutting down instance ids %s" % instance_ids_to_shutdown.keys())
            self.clusters_api.shutdown(instance_ids_to_shutdown.keys())
            
            for hostname in instance_ids_to_shutdown.itervalues():
                pbscc.info("Deleting %s" % hostname)
                self.driver.delete_host(hostname)
        
        now = self.clock.time()
        
        stop_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.stop_enabled", "true")).lower()
        
        if not stop_enabled:
            pbscc.warn("cyclecloud.cluster.autoscale.stop_enabled is false, idle machines will not be terminated")
        
        if stop_enabled:
            idle_before_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_before_jobs", 3600))
            idle_after_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_after_jobs", 300))
        
            for m in idle_machines:
                if m.get_attr("instance_id", "") not in booting_instance_ids:
                    pbscc.debug("Could not find instance id in CycleCloud %s" % m.get_attr("instance_id", ""))
                    continue
                
                pbsnode = pbsnodes_by_hostname.get(m.hostname)
                
                # the machine may not have converged yet, so
                if pbsnode:
                    if "busy" in pbsnode["state"]:
                        if "down" in pbsnode["state"]:
                            pbscc.warn("WARNING: %s is down but busy with jobs %s", m.hostname, pbsnode.get("jobs", []))
                        else:
                            pbscc.error("WARNING: Falsely determined that %s is idle!" % m.hostname)
                        continue
                    
                    last_state_change_time = pbsnode["last_state_change_time"]
                    last_used_time = pbsnode.get("last_used_time")
                    if last_used_time:
                        # last_used_time can be stale while a job is exiting, e.g. last_state_change_time could be < 5 minutes but
                        # somehow last_used_time > 5 minutes, causing us to prematurely terminate the node just because a job took a long time
                        # to exit.
                        last_used_time = max(last_state_change_time, last_used_time)
                    else:
                        last_used_time = self.clock.time()

                    if now - last_used_time > idle_after_threshold:
                        pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_used_time))
                        self.driver.set_offline(m.hostname)
                    elif now - last_state_change_time > idle_before_threshold:
                        pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_state_change_time))
                        self.driver.set_offline(m.hostname)
        
        pbscc.info("End autoscale cycle")
        # returned for testing purposes
        return machine_requests, idle_machines, autoscaler.machines