def query_jobs(self):
    '''
    Converts PBS jobs into cyclecloud.job.Job instances. It will also compress jobs that have the exact same requirements.

    Running jobs that span multiple vnodes are split into one record per vnode, and
    queued jobs with a multi-chunk select statement are split into one synthetic job
    per chunk (job_id suffixed with ".<n>"), so each resulting Job describes a single
    homogeneous node requirement.

    Returns a list of Job instances: running jobs first, then queued jobs (optionally
    compressed via compress_queued_jobs when pbspro.compress_jobs is enabled).
    '''
    scheduler_config = self.driver.scheduler_config()
    # copy the scheduler's resource list so the appends below don't mutate the config
    scheduler_resources = [] + scheduler_config["resources"]
    # special case for hostname so we can automatically place jobs onto the appropriate host
    scheduler_resources.append("hostname")
    scheduler_resources.append("instance_id")

    group_jobs = not self.disable_grouping

    running_autoscale_jobs = []
    idle_autoscale_jobs = []

    # get the raw string outputs first, and convert it second. This somewhat limits the
    # race condition of asking the status of the queue twice.
    running_raw_jobs_str, running_converter = self.driver.running_jobs()
    queued_raw_jobs_str, queued_converter = self.driver.queued_jobs()

    running_raw_jobs = running_converter(running_raw_jobs_str)
    queued_raw_jobs = queued_converter(queued_raw_jobs_str)

    raw_jobs = []

    # Split running jobs so there is one raw job per execution vnode.
    for raw_job in running_raw_jobs:
        # it is only running on a single node
        if '+' not in raw_job["exec_vnode"]:
            raw_jobs.append(raw_job)
            continue

        # multi-vnode job: emit one copy per vnode ("a+b+c" -> three records)
        for vnode in raw_job["exec_vnode"].split("+"):
            sub_raw_job = deepcopy(raw_job)
            sub_raw_job["exec_vnode"] = vnode
            raw_jobs.append(sub_raw_job)

    for raw_job in queued_raw_jobs:
        if not raw_job["resource_list"].get("select"):
            # no select statement - take the job as-is
            raw_jobs.append(raw_job)
        else:
            # pbspro, like many schedulers, allows a varying set requirements for nodes in a single submission.
            # we will break it apart here as if they had split them individually.
            place = raw_job["resource_list"].get("place")
            slot_type = raw_job["resource_list"].get("slot_type")
            chunks = pbscc.parse_select(raw_job)
            for n, chunk in enumerate(chunks):
                # only pay the penalty of copies when we actually have multi-chunk jobs
                sub_raw_job = deepcopy(raw_job)
                if len(chunks) > 1:
                    # give each chunk a unique, derived job id ("1234" -> "1234.0", "1234.1", ...)
                    sub_raw_job["job_id"] = "%s.%d" % (sub_raw_job["job_id"], n)

                # rebuild resource_list from this chunk only, carrying over place/slot_type
                sub_raw_job["resource_list"] = {}
                if place:
                    sub_raw_job["resource_list"]["place"] = place
                if slot_type:
                    sub_raw_job["resource_list"]["slot_type"] = slot_type
                sub_raw_job["resource_list"]["select"] = pbscc.format_select(chunk)

                # the leading number of a select chunk is the node count for that chunk
                chunk["nodect"] = int(chunk["select"])
                if "ncpus" not in chunk:
                    # PBS default when ncpus is omitted from the chunk
                    chunk["ncpus"] = "1"

                for key, value in chunk.iteritems():
                    if key not in ["select", "nodect"]:
                        try:
                            # chunk resources are per-node; scale to a job total here.
                            # (the per-Job loop further below divides totals back by nodect)
                            value = pbscc.parse_gb_size(key, value) * chunk["nodect"]
                        except InvalidSizeExpressionError:
                            # non-numeric resource (e.g. a string flag) - keep as-is
                            pass
                        sub_raw_job["resource_list"][key] = value

                sub_raw_job["nodect"] = sub_raw_job["resource_list"]["nodect"] = chunk["nodect"]
                raw_jobs.append(sub_raw_job)

    # NOTE(review): nothing below ever adds to this set, so the error loop near the
    # end is currently dead code - presumably a leftover collection point for
    # per-job warnings. Confirm before removing.
    warnings = set()

    # only queued/running/batch jobs are relevant for autoscale decisions
    raw_jobs = [x for x in raw_jobs if x["job_state"].upper() in [pbscc.JOB_STATE_QUEUED, pbscc.JOB_STATE_RUNNING, pbscc.JOB_STATE_BATCH]]

    for raw_job in raw_jobs:
        pbs_job = mockpbs.mock_job(raw_job)
        nodect = int(pbs_job.Resource_List["nodect"])

        if pbs_job["job_state"].upper() == pbscc.JOB_STATE_RUNNING:
            # update running job
            live_resources = pbscc.parse_exec_vnode(raw_job["exec_vnode"])
            for key, value in live_resources.iteritems():
                # live resources are calculated on a per node basis, but the Resource_List is based
                # on a total basis.
                # we will normalize this below
                if isinstance(value, numbers.Number):
                    pbs_job.Resource_List[key] = value * nodect
                else:
                    pbs_job.Resource_List[key] = value
            pbs_job["executing_hostname"] = live_resources["hostname"]

        is_array = bool(pbs_job.get("array", False))
        # integer (floor) division under Python 2: total ncpus spread across nodect nodes
        slots_per_job = int(pbs_job.Resource_List['ncpus']) / nodect

        slot_type = pbs_job.Resource_List["slot_type"]  # can be None, similar to {}.get("key"). It is a pbs class.
        pbscc.info("found slot_type %s." % slot_type)

        placement = pbscc.parse_place(pbs_job.Resource_List.get("place"))
        # Note: not sure we will ever support anything but group_id for autoscale purposes.
        # User could pick, say, group=host, which implies an SMP job, not a parallel job.
        if placement.get("grouping", "group=group_id") != "group=group_id":
            placement.pop("grouping")

        if placement.get("arrangement", "").lower() in ["scatter", "vscatter"]:
            pack = "scatter"
        else:
            pack = "pack"

        exclusive = placement.get("sharing", "").lower() in ["excl", "exclhost"]
        # we may need to support sharing at some point, but it seems that we can ignore it for now.
        _shared = placement.get("sharing") in ["sharing"]
        placeby = placement.get("grouping")

        autoscale_job = Job(name=pbs_job["job_id"],
                            nodearray=slot_type,
                            nodes=nodect,
                            packing_strategy=pack,
                            exclusive=exclusive,
                            resources={"ncpus": 0},
                            executing_hostname=pbs_job.get("executing_hostname"))
        if placeby:
            # "group=group_id" -> placeby attribute "group_id"
            autoscale_job.placeby = placeby.split("=", 1)[-1]

        if is_array:
            array_count = 0
            array_tasks = raw_job["array_state_count"]
            # Only grab the first two array task states (queued and running)
            for ajob in str(array_tasks).split(" ")[:2]:
                array_count += int(ajob.split(":")[1])
            # Multiply the number of cpus needed by number of tasks in the array
            if array_count != 0:
                slots_per_job *= array_count
        else:
            array_count = 1

        # If it's an MPI job and grouping is enabled
        # we want to use a grouped autoscale_job to get tightly coupled nodes
        if group_jobs and placement.get("grouping"):
            autoscale_job['grouped'] = True
            autoscale_job["nodes"] *= array_count
            autoscale_job.placeby_value = "single"
        elif is_array:
            autoscale_job["nodes"] *= array_count

        autoscale_job.ncpus += slots_per_job

        for attr, value in pbs_job.Resource_List.iteritems():
            if attr not in scheduler_resources:
                # if it isn't a scheduler level attribute, don't bother
                # considering it for autoscale as the scheduler won't respect it either.
                continue

            try:
                # Resource_List holds job totals; convert back to a per-node amount
                value = pbscc.parse_gb_size(attr, value)
                value = value / nodect
            except InvalidSizeExpressionError:
                # not a size - coerce boolean-looking strings, leave everything else alone
                if value.lower() in ["true", "false"]:
                    value = value.lower() == "true"

            autoscale_job.resources[attr] = value

        if raw_job["job_state"] == pbscc.JOB_STATE_QUEUED:
            idle_autoscale_jobs.append(autoscale_job)
        else:
            running_autoscale_jobs.append(autoscale_job)

    # see NOTE(review) above: warnings is never populated, so this loop never runs
    for warning in warnings:
        format_string, values = warning[0], warning[1:]
        pbscc.error(format_string % values)

    # leave an option for disabling this in case it causes issues.
    if self.cc_config.get("pbspro.compress_jobs", False):
        all_autoscale_jobs = running_autoscale_jobs + compress_queued_jobs(idle_autoscale_jobs)
    else:
        all_autoscale_jobs = running_autoscale_jobs + idle_autoscale_jobs

    return all_autoscale_jobs
def autoscale(self):
    '''
    The main loop described at the top of this class. Returns machine_requests, idle_machines and total_machines
    for ease of unit testing.

    One cycle does, in order:
      1. match jobs (from query_jobs) onto existing/new machines,
      2. request new machines via the CycleCloud autoscale API,
      3. shut down instances already flagged for termination,
      4. set idle machines offline based on two idle-time thresholds.
    '''
    pbscc.info("Begin autoscale cycle")
    nodearray_definitions = self.fetch_nodearray_definitions()

    pbsnodes_by_hostname, existing_machines, booting_instance_ids, instance_ids_to_shutdown = self.get_existing_machines(nodearray_definitions)

    start_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.start_enabled", "true")).lower()
    if not start_enabled:
        pbscc.warn("cyclecloud.cluster.autoscale.start_enabled is false, new machines will not be allocated.")

    autoscaler = autoscalerlib.Autoscaler(nodearray_definitions, existing_machines, self.default_placement_attrs, start_enabled)

    # throttle how many jobs we attempt to match. When pbspro.compress_jobs is true (default) this shouldn't really be an issue
    # unless the user has over $pbspro.max_unmatched_jobs unique sets of requirements.
    max_unmatched_jobs = int(self.cc_config.get("pbspro.max_unmatched_jobs", 10000))
    unmatched_jobs = 0

    for job in self.query_jobs():
        if job.executing_hostname:
            # job is already running somewhere: pin it to that machine.
            # force=True because the scheduler already made this placement decision.
            try:
                autoscaler.get_machine(hostname=job.executing_hostname).add_job(job, force=True)
                continue
            except RuntimeError as e:
                # couldn't pin to the executing host; log and fall through to normal matching
                pbscc.error(str(e))
                pass

        if not autoscaler.add_job(job):
            unmatched_jobs += 1
            pbscc.info("Can not match job %s." % job.name)
            if max_unmatched_jobs > 0 and unmatched_jobs >= max_unmatched_jobs:
                pbscc.warn('Maximum number of unmatched jobs reached - %s. To configure this setting, change {"pbspro": "max_unmatched_jobs": N}} in %s' % (unmatched_jobs, pbscc.CONFIG_PATH))
                break

    machine_requests = autoscaler.get_new_machine_requests()
    idle_machines = autoscaler.get_idle_machines()

    # annotate each request set so new nodes boot with the right pbspro config
    autoscale_request = autoscale_util.create_autoscale_request(machine_requests)
    for request_set in autoscale_request["sets"]:
        configuration = request_set["nodeAttributes"]["Configuration"]
        if "pbspro" not in configuration:
            configuration["pbspro"] = {}
        configuration["pbspro"]["slot_type"] = request_set["nodearray"]
        # grouped == allocated inside a placement group (tightly coupled / MPI)
        if not request_set.get("placementGroupId"):
            configuration["pbspro"]["is_grouped"] = False
        else:
            configuration["pbspro"]["is_grouped"] = True

    autoscale_util.scale_up(self.clusters_api, autoscale_request)

    for r in machine_requests:
        if r.placeby_value:
            pbscc.info("Requesting %d %s machines in placement group %s for nodearray %s" % (r.instancecount, r.machinetype, r.placeby_value, r.nodearray))
        else:
            pbscc.info("Requesting %d %s machines in nodearray %s" % (r.instancecount, r.machinetype, r.nodearray))

    if pbscc.is_fine():
        pbscc.fine("New target state of the cluster, including booting nodes:")
        for m in autoscaler.machines:
            pbscc.fine("    %s" % str(m))

    if instance_ids_to_shutdown:
        # terminate in CycleCloud first, then remove the hosts from PBS
        pbscc.info("Shutting down instance ids %s" % instance_ids_to_shutdown.keys())
        self.clusters_api.shutdown(instance_ids_to_shutdown.keys())

        for hostname in instance_ids_to_shutdown.itervalues():
            pbscc.info("Deleting %s" % hostname)
            self.driver.delete_host(hostname)

    now = self.clock.time()

    stop_enabled = "true" == str(self.cc_config.get("cyclecloud.cluster.autoscale.stop_enabled", "true")).lower()

    if not stop_enabled:
        pbscc.warn("cyclecloud.cluster.autoscale.stop_enabled is false, idle machines will not be terminated")

    if stop_enabled:
        # two thresholds: a long one for nodes that have never run a job
        # (idle_time_before_jobs) and a short one for nodes that finished
        # their jobs (idle_time_after_jobs).
        idle_before_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_before_jobs", 3600))
        idle_after_threshold = float(self.cc_config.get("cyclecloud.cluster.autoscale.idle_time_after_jobs", 300))

        for m in idle_machines:
            # skip machines CycleCloud doesn't know about (e.g. already terminated)
            if m.get_attr("instance_id", "") not in booting_instance_ids:
                pbscc.debug("Could not find instance id in CycleCloud %s" % m.get_attr("instance_id", ""))
                continue

            pbsnode = pbsnodes_by_hostname.get(m.hostname)

            # the machine may not have converged yet, so
            if pbsnode:
                if "busy" in pbsnode["state"]:
                    if "down" in pbsnode["state"]:
                        # down-but-busy: warn, but still allow the idle checks below
                        pbscc.warn("WARNING: %s is down but busy with jobs %s", m.hostname, pbsnode.get("jobs", []))
                    else:
                        # genuinely busy - our idle determination was wrong, keep the node
                        pbscc.error("WARNING: Falsely determined that %s is idle!" % m.hostname)
                        continue

                last_state_change_time = pbsnode["last_state_change_time"]
                last_used_time = pbsnode.get("last_used_time")
                if last_used_time:
                    # last_used_time can be stale while a job is exiting, e.g. last_state_change_time could be < 5 minutes but
                    # somehow last_used_time > 5 minutes, causing us to prematurely terminate the node just because a job took a long time
                    # to exit.
                    last_used_time = max(last_state_change_time, last_used_time)
                else:
                    # never used: treat "now" as last use so only the longer
                    # idle_before_threshold (via last_state_change_time) applies
                    last_used_time = self.clock.time()

                if now - last_used_time > idle_after_threshold:
                    pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_used_time))
                    self.driver.set_offline(m.hostname)
                elif now - last_state_change_time > idle_before_threshold:
                    pbscc.info("Setting %s offline after %s seconds" % (m.hostname, now - last_state_change_time))
                    self.driver.set_offline(m.hostname)

    pbscc.info("End autoscale cycle")
    # returned for testing purposes
    return machine_requests, idle_machines, autoscaler.machines