def _hook():
    pbscc.set_application_name("cycle_autoscale")

    # allow local overrides of jetpack.config, or allow non-jetpack masters to define
    # the complete set of settings.
    overrides = {}
    if os.path.exists(pbscc.CONFIG_PATH):
        try:
            pbscc.warn("overrides exist in file %s" % pbscc.CONFIG_PATH)
            with open(pbscc.CONFIG_PATH) as fr:
                overrides = json.load(fr)
        except Exception:
            pbscc.error(traceback.format_exc())
            sys.exit(1)
    else:
        pbscc.debug("No overrides exist in file %s" % pbscc.CONFIG_PATH)

    cc_config = cyclecloud.config.new_provider_config(overrides=overrides)

    if len(sys.argv) < 3:
        # There are no env variables for this as far as I can tell.
        bin_dir = "/opt/pbs/bin"
    else:
        bin_dir = sys.argv[2]
        if not os.path.isdir(bin_dir):
            bin_dir = os.path.dirname(bin_dir)

    clusters_api = clustersapi.ClustersAPI(cc_config.get("cyclecloud.cluster.name"), cc_config)
    autostart = PBSAutostart(pbs_driver.PBSDriver(bin_dir), clusters_api, cc_config=cc_config)
    autostart.autoscale()
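
# For reference, a minimal overrides file at pbscc.CONFIG_PATH might look like the
# JSON below. This is an illustrative sketch: the keys are the ones this module
# reads via cc_config.get(...), but the exact nesting accepted by
# cyclecloud.config.new_provider_config is an assumption here.
#
#     {
#         "cyclecloud": {
#             "cluster": {
#                 "name": "pbscluster",
#                 "autoscale": {
#                     "start_enabled": true,
#                     "stop_enabled": true,
#                     "idle_time_before_jobs": 3600,
#                     "idle_time_after_jobs": 300
#                 }
#             }
#         },
#         "pbspro": {
#             "max_unmatched_jobs": 10000,
#             "compress_jobs": true,
#             "remove_down_nodes": 300
#         }
#     }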
def compress_queued_jobs(autoscale_jobs):
    '''
    Compress queued jobs that have identical requirements into a single pseudo-job,
    bucketing on nodearray, num nodes, placeby/placeby_value, exclusivity, packing
    strategy and, of course, the requested resources.

    A compressed job will have the name of the first job with ".compressed" appended
    to it. i.e. the goal is that if someone does `qsub -l mem=1g` 1000 times, we can
    compress that so that it appears as if they did `qsub -l select=1000:mem=1g`

    We do not compress SCATTER jobs.
    '''
    ret = []

    compression_buckets = collections.defaultdict(list)

    for job in autoscale_jobs:
        if job.packing_strategy == PackingStrategy.SCATTER:
            # for now, we will only worry about compressing PACK jobs as an optimization.
            ret.append(job)
            continue

        # sort the resource items so that dict ordering does not affect bucketing
        comp_bucket = (job.nodearray, job.nodes, job.placeby, job.placeby_value,
                       job.exclusive, job.packing_strategy) + tuple(sorted(job.resources.items()))
        compression_buckets[comp_bucket].append(job)

    for comp_bucket, job_list in compression_buckets.iteritems():
        if len(job_list) == 1:
            ret.extend(job_list)
            continue

        first_job = job_list[0]
        pseudo_job = Job(first_job.name + ".compressed", first_job.nodes * len(job_list),
                         first_job.nodearray, first_job.exclusive, first_job.packing_strategy,
                         first_job.resources, first_job.placeby, first_job.placeby_value)
        ret.append(pseudo_job)
        pbscc.debug("Compressed jobs matching job id %s from %d jobs down to a single job"
                    % (first_job.name, len(job_list)))

    return ret
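
# A minimal sketch of the compression behavior, assuming the positional Job
# constructor used above (name, nodes, nodearray, exclusive, packing_strategy,
# resources, placeby, placeby_value); the values are illustrative only:
#
#     jobs = [Job("%d.master" % i, 1, "execute", False, PackingStrategy.PACK,
#                 {"mem": 1.0}, None, None) for i in range(1000)]
#     compressed = compress_queued_jobs(jobs)
#     assert len(compressed) == 1
#     assert compressed[0].name.endswith(".compressed")
#     assert compressed[0].nodes == 1000  # as if qsub -l select=1000:mem=1g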
def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
    '''
    If the pbsnode is offline, handles evaluating whether the node can be shut down.
    See instance_ids_to_shutdown, which is an OUT parameter here.

    Otherwise, converts the pbsnode into a cyclecloud.machine.Machine instance.
    '''
    states = set(pbsnode["state"].split(","))
    resources = pbsnode["resources_available"]

    # "host" has incorrect case, so use the vnode name as the hostname.
    hostname = resources["vnode"]

    instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))

    def try_shutdown_pbsnode():
        if not instance_id:
            pbscc.error("instance_id was not defined for host %s, can not shut it down" % hostname)
        elif "down" in states:
            # don't immediately remove down nodes, give them time to recover from a
            # transient network failure.
            remove_down_nodes = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
            since_down = self.clock.time() - pbsnode["last_state_change_time"]
            if since_down > remove_down_nodes:
                pbscc.error("Removing down node %s after %.0f seconds", hostname, since_down)
                instance_ids_to_shutdown[instance_id] = hostname
                return True
            else:
                omega = remove_down_nodes - since_down
                pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, omega)
        else:
            instance_ids_to_shutdown[instance_id] = hostname
            return True

        return False

    if "offline" in states:
        if not pbsnode.get("jobs", []):
            pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
            if try_shutdown_pbsnode():
                return
        else:
            pbscc.fine("Host %s is offline but still running jobs" % hostname)

    # if the node is just in the down state, try to shut it down.
    if set(["down"]) == states and try_shutdown_pbsnode():
        return

    # just ignore complex down nodes (down,job-busy etc.) until PBS decides to change the state.
    if "down" in states:
        return

    # convert relevant resources from bytes to floating point (GB)
    for key in resources:
        value = resources[key]
        if isinstance(value, basestring) and value.lower() in ["true", "false"]:
            resources[key] = value.lower() == "true"
        elif isinstance(value, list):
            # TODO will need to support this eventually
            continue
        else:
            try:
                resources[key] = pbscc.parse_gb_size(key, resources[key])
            except InvalidSizeExpressionError:
                pass

    resources["hostname"] = hostname

    nodearray_name = resources.get("nodearray") or resources.get("slot_type")
    group_id = resources.get("group_id")

    if resources.get("machinetype") and nodearray_name:
        machinetype = nodearray_definitions.get_machinetype(nodearray_name, resources.get("machinetype"), group_id)
    else:
        # rely solely on resources_available
        pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
        machinetype = {"availableCount": 1, "name": "undefined"}

    inst = machine.new_machine_instance(machinetype, **pbsnode["resources_available"])
    return inst
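
# For reference, a pbsnode record as consumed above looks roughly like the sketch
# below. The field names come from the code; the values are illustrative assumptions:
#
#     {
#         "state": "offline,down",
#         "jobs": [],
#         "last_state_change_time": 1500000000,
#         "resources_available": {
#             "vnode": "ip-0a000405",
#             "instance_id": "instanceid-0abc123",
#             "nodearray": "execute",
#             "machinetype": "Standard_D2_v3",
#             "mem": "8gb",
#             "ncpus": 2
#         }
#     }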
def autoscale(self):
    '''
    The main loop described at the top of this class. Returns machine_requests,
    idle_machines and total_machines for ease of unit testing.
    '''
    pbscc.info("Begin autoscale cycle")
    nodearray_definitions = self.fetch_nodearray_definitions()
    pbsnodes_by_hostname, existing_machines, booting_instance_ids, instance_ids_to_shutdown = self.get_existing_machines(nodearray_definitions)

    start_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.start_enabled", "true")).lower()
    if not start_enabled:
        pbscc.warn("cyclecloud.cluster.autoscale.start_enabled is false, new machines will not be allocated.")

    autoscaler = autoscalerlib.Autoscaler(nodearray_definitions, existing_machines, self.default_placement_attrs, start_enabled)

    # throttle how many jobs we attempt to match. When pbspro.compress_jobs is enabled,
    # this shouldn't really be an issue unless the user has more than
    # pbspro.max_unmatched_jobs unique sets of requirements.
    max_unmatched_jobs = int(self.cc_config.get("pbspro.max_unmatched_jobs", 10000))
    unmatched_jobs = 0

    for job in self.query_jobs():
        if job.executing_hostname:
            try:
                autoscaler.get_machine(hostname=job.executing_hostname).add_job(job, force=True)
                continue
            except RuntimeError as e:
                pbscc.error(str(e))

        if not autoscaler.add_job(job):
            unmatched_jobs += 1
            pbscc.info("Can not match job %s." % job.name)
            if max_unmatched_jobs > 0 and unmatched_jobs >= max_unmatched_jobs:
                pbscc.warn('Maximum number of unmatched jobs reached - %s. To configure this setting, '
                           'change {"pbspro": {"max_unmatched_jobs": N}} in %s' % (unmatched_jobs, pbscc.CONFIG_PATH))
                break

    machine_requests = autoscaler.get_new_machine_requests()
    idle_machines = autoscaler.get_idle_machines()

    autoscale_request = autoscale_util.create_autoscale_request(machine_requests)

    for request_set in autoscale_request["sets"]:
        configuration = request_set["nodeAttributes"]["Configuration"]

        if "pbspro" not in configuration:
            configuration["pbspro"] = {}

        configuration["pbspro"]["slot_type"] = request_set["nodearray"]

        if not request_set.get("placementGroupId"):
            configuration["pbspro"]["is_grouped"] = False
        else:
            configuration["pbspro"]["is_grouped"] = True

    autoscale_util.scale_up(self.clusters_api, autoscale_request)

    for r in machine_requests:
        if r.placeby_value:
            pbscc.info("Requesting %d %s machines in placement group %s for nodearray %s"
                       % (r.instancecount, r.machinetype, r.placeby_value, r.nodearray))
        else:
            pbscc.info("Requesting %d %s machines in nodearray %s"
                       % (r.instancecount, r.machinetype, r.nodearray))

    if pbscc.is_fine():
        pbscc.fine("New target state of the cluster, including booting nodes:")
        for m in autoscaler.machines:
            pbscc.fine("    %s" % str(m))

    if instance_ids_to_shutdown:
        pbscc.info("Shutting down instance ids %s" % instance_ids_to_shutdown.keys())
        self.clusters_api.shutdown(instance_ids_to_shutdown.keys())

        for hostname in instance_ids_to_shutdown.itervalues():
            pbscc.info("Deleting %s" % hostname)
            self.driver.delete_host(hostname)

    now = self.clock.time()

    stop_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.stop_enabled", "true")).lower()
    if not stop_enabled:
        pbscc.warn("cyclecloud.cluster.autoscale.stop_enabled is false, idle machines will not be terminated")

    if stop_enabled:
        idle_before_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_before_jobs", 3600))
        idle_after_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_after_jobs", 300))

        for m in idle_machines:
            if m.get_attr("instance_id", "") not in booting_instance_ids:
                pbscc.debug("Could not find instance id in CycleCloud %s" % m.get_attr("instance_id", ""))
                continue

            pbsnode = pbsnodes_by_hostname.get(m.hostname)

            # the machine may not have converged yet, so pbsnode may be None here.
            if pbsnode:
                if "busy" in pbsnode["state"]:
                    if "down" in pbsnode["state"]:
                        pbscc.warn("WARNING: %s is down but busy with jobs %s", m.hostname, pbsnode.get("jobs", []))
                    else:
                        pbscc.error("WARNING: Falsely determined that %s is idle!" % m.hostname)
                        continue

                last_state_change_time = pbsnode["last_state_change_time"]
                last_used_time = pbsnode.get("last_used_time")
                if last_used_time:
                    # last_used_time can be stale while a job is exiting, e.g. last_state_change_time
                    # could be < 5 minutes while somehow last_used_time is > 5 minutes, causing us to
                    # prematurely terminate the node just because a job took a long time to exit.
                    last_used_time = max(last_state_change_time, last_used_time)
                else:
                    last_used_time = self.clock.time()

                if now - last_used_time > idle_after_threshold:
                    pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_used_time))
                    self.driver.set_offline(m.hostname)
                elif now - last_state_change_time > idle_before_threshold:
                    pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_state_change_time))
                    self.driver.set_offline(m.hostname)

    pbscc.info("End autoscale cycle")

    # returned for testing purposes
    return machine_requests, idle_machines, autoscaler.machines
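
# A minimal sketch of the idle-timeout decision above, pulled out as a pure
# function for clarity. The name should_set_offline is hypothetical, not part
# of this module:
#
#     def should_set_offline(now, last_state_change_time, last_used_time,
#                            idle_before_threshold=3600., idle_after_threshold=300.):
#         if last_used_time:
#             # guard against a stale last_used_time while a job is exiting
#             last_used_time = max(last_state_change_time, last_used_time)
#         else:
#             last_used_time = now
#         if now - last_used_time > idle_after_threshold:
#             return True
#         return now - last_state_change_time > idle_before_threshold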
def query_jobs(self):
    '''
    Converts PBS jobs into cyclecloud.job.Job instances. It will also compress jobs
    that have the exact same requirements.
    '''
    scheduler_config = self.driver.scheduler_config()
    scheduler_resources = list(scheduler_config["resources"])
    # special case hostname so we can automatically place jobs onto the appropriate host
    scheduler_resources.append("hostname")
    scheduler_resources.append("instance_id")

    group_jobs = not self.disable_grouping

    running_autoscale_jobs = []
    idle_autoscale_jobs = []

    # get the raw string outputs first and convert them second. This somewhat limits
    # the race condition of asking the status of the queue twice.
    running_raw_jobs_str, running_converter = self.driver.running_jobs()
    queued_raw_jobs_str, queued_converter = self.driver.queued_jobs()
    queued_raw_arr_jobs_str, queued_arr_converter = self.driver.queued_array_jobs()

    running_raw_jobs = running_converter(running_raw_jobs_str)

    # ignore any array jobs in here - this returns only idle array jobs that haven't
    # started a single task. Instead, the next call gets all job arrays and we just
    # ignore those with Queued:0 in the array state count.
    queued_raw_single_jobs = [x for x in queued_converter(queued_raw_jobs_str) if not x.get("array")]
    queued_raw_arr_jobs = queued_arr_converter(queued_raw_arr_jobs_str)

    def sort_by_job_id(raw_job):
        # sort numerically by the leading digits of the job id, e.g. "123.master" -> 123
        job_id = raw_job.get("job_id")
        if not job_id:
            return -1
        i = 0
        while i < len(job_id) and job_id[i].isdigit():
            i += 1
        return int(job_id[:i]) if i > 0 else -1

    queued_raw_jobs = sorted(queued_raw_single_jobs + queued_raw_arr_jobs, key=sort_by_job_id)

    raw_jobs = []

    for raw_job in running_raw_jobs:
        # it is only running on a single node
        if '+' not in raw_job["exec_vnode"]:
            raw_jobs.append(raw_job)
            continue

        # split multi-node jobs into one sub-job per vnode
        for vnode in raw_job["exec_vnode"].split("+"):
            sub_raw_job = deepcopy(raw_job)
            sub_raw_job["exec_vnode"] = vnode
            raw_jobs.append(sub_raw_job)

    for raw_job in queued_raw_jobs:
        if not raw_job["resource_list"].get("select"):
            raw_jobs.append(raw_job)
        else:
            # pbspro, like many schedulers, allows a varying set of requirements for
            # nodes in a single submission. we will break it apart here as if they
            # had been submitted individually.
            place = raw_job["resource_list"].get("place")
            slot_type = raw_job["resource_list"].get("slot_type")

            chunks = pbscc.parse_select(raw_job)

            for n, chunk in enumerate(chunks):
                sub_raw_job = deepcopy(raw_job)

                # only rename the job when we actually have a multi-chunk job
                if len(chunks) > 1:
                    sub_raw_job["job_id"] = "%s.%d" % (sub_raw_job["job_id"], n)

                sub_raw_job["resource_list"] = {}
                if place:
                    sub_raw_job["resource_list"]["place"] = place

                if slot_type:
                    sub_raw_job["resource_list"]["slot_type"] = slot_type

                sub_raw_job["resource_list"]["select"] = pbscc.format_select(chunk)

                chunk["nodect"] = int(chunk["select"])
                if "ncpus" not in chunk:
                    chunk["ncpus"] = "1"

                for key, value in chunk.iteritems():
                    if key not in ["select", "nodect"]:
                        try:
                            # chunk resources are per node, so scale them to a per-chunk total
                            value = pbscc.parse_gb_size(key, value) * chunk["nodect"]
                        except InvalidSizeExpressionError:
                            pass
                        sub_raw_job["resource_list"][key] = value

                sub_raw_job["nodect"] = sub_raw_job["resource_list"]["nodect"] = chunk["nodect"]

                raw_jobs.append(sub_raw_job)

    warnings = set()

    raw_jobs = [x for x in raw_jobs if x["job_state"].upper() in
                [pbscc.JOB_STATE_QUEUED, pbscc.JOB_STATE_RUNNING, pbscc.JOB_STATE_BATCH]]

    for raw_job in raw_jobs:
        pbs_job = mockpbs.mock_job(raw_job)

        nodect = int(pbs_job.Resource_List["nodect"])

        if pbs_job["job_state"].upper() == pbscc.JOB_STATE_RUNNING:
            # update the running job with its live resource usage
            live_resources = pbscc.parse_exec_vnode(raw_job["exec_vnode"])
            for key, value in live_resources.iteritems():
                # live resources are calculated on a per-node basis, but the Resource_List
                # is on a total basis. we will normalize this below.
                if isinstance(value, numbers.Number):
                    pbs_job.Resource_List[key] = value * nodect
                else:
                    pbs_job.Resource_List[key] = value

            pbs_job["executing_hostname"] = live_resources["hostname"]

        is_array = bool(pbs_job.get("array", False))
        slots_per_job = int(pbs_job.Resource_List['ncpus']) / nodect

        # can be None, similar to {}.get("key"). It is a pbs class.
        slot_type = pbs_job.Resource_List["slot_type"]
        pbscc.info("found slot_type %s." % slot_type)

        placement = pbscc.parse_place(pbs_job.Resource_List.get("place"))

        # Note: not sure we will ever support anything but group_id for autoscale purposes.
        # The user could pick, say, group=host, which implies an SMP job, not a parallel job.
        if placement.get("grouping", "group=group_id") != "group=group_id":
            placement.pop("grouping")

        if placement.get("arrangement", "").lower() in ["scatter", "vscatter"]:
            pack = "scatter"
        else:
            pack = "pack"

        exclusive = placement.get("sharing", "").lower() in ["excl", "exclhost"]
        # we may need to support sharing at some point, but it seems we can ignore it for now.
        _shared = placement.get("sharing") in ["sharing"]
        placeby = placement.get("grouping")

        autoscale_job = Job(name=pbs_job["job_id"],
                            nodearray=slot_type,
                            nodes=nodect,
                            packing_strategy=pack,
                            exclusive=exclusive,
                            resources={"ncpus": 0},
                            executing_hostname=pbs_job.get("executing_hostname"))

        if placeby:
            autoscale_job.placeby = placeby.split("=", 1)[-1]

        if is_array:
            array_tasks = raw_job["array_state_count"]
            # we only want the remaining number that are queued; the running tasks are
            # handled separately. example: "Queued:6 Running:2 Exiting:0 Expired:0"
            array_count = int(str(array_tasks).split(" ")[0].split(":")[1])
            if array_count == 0:
                pbscc.debug("Job {} has no remaining tasks. Skipping.".format(raw_job["job_id"]))
                continue

            # multiply the number of slots needed by the number of tasks in the array
            slots_per_job *= array_count
        else:
            array_count = 1

        # if it is an MPI job and grouping is enabled, we want to use a grouped
        # autoscale_job to get tightly coupled nodes.
        if group_jobs and placement.get("grouping"):
            autoscale_job['grouped'] = True
            autoscale_job["nodes"] *= array_count
            autoscale_job.placeby_value = pbs_job.Resource_List.get("group_id") or None
        elif is_array:
            autoscale_job["nodes"] *= array_count

        autoscale_job.ncpus += slots_per_job

        for attr, value in pbs_job.Resource_List.iteritems():
            if attr not in scheduler_resources:
                # if it isn't a scheduler-level attribute, don't bother considering it
                # for autoscale, as the scheduler won't respect it either.
                continue

            try:
                value = pbscc.parse_gb_size(attr, value)
                value = value / nodect
            except InvalidSizeExpressionError:
                if value.lower() in ["true", "false"]:
                    value = value.lower() == "true"

            autoscale_job.resources[attr] = value

        if raw_job["job_state"] == pbscc.JOB_STATE_QUEUED:
            idle_autoscale_jobs.append(autoscale_job)
        else:
            running_autoscale_jobs.append(autoscale_job)

    for warning in warnings:
        format_string, values = warning[0], warning[1:]
        pbscc.error(format_string % values)

    # leave an option for disabling compression in case it causes issues.
    if self.cc_config.get("pbspro.compress_jobs", False):
        all_autoscale_jobs = running_autoscale_jobs + compress_queued_jobs(idle_autoscale_jobs)
    else:
        all_autoscale_jobs = running_autoscale_jobs + idle_autoscale_jobs

    return all_autoscale_jobs