def fetch_nodearray_definitions(self):
    '''
    A wrapper around the autoscale library function to parse Configuration.autoscale.* chef attributes
    and add the 'ungrouped' attribute to the machine types. See cyclecloud.nodearrays.NodearrayDefinitions
    for more info.
    '''
    nodearray_definitions = machine.fetch_nodearray_definitions(self.clusters_api, self.default_placement_attrs)
    nodearray_definitions.placement_group_optional = True

    filtered_nodearray_definitions = nodearrays.NodearrayDefinitions()
    for machinetype in nodearray_definitions:
        if machinetype.get("disabled", False):
            continue

        # ensure that any custom attribute the user specified, like disk = 100G, gets parsed correctly
        for key, value in machinetype.iteritems():
            try:
                machinetype[key] = pbscc.parse_gb_size(key, value)
            except InvalidSizeExpressionError:
                pass

        # kludge: there is a strange bug where ungrouped is showing up as a string and not a boolean.
        if not machinetype.get("group_id"):
            machinetype["ungrouped"] = "true"
            filtered_nodearray_definitions.add_machinetype(machinetype)
        else:
            machinetype["ungrouped"] = "false"
            filtered_nodearray_definitions.add_machinetype_with_placement_group(machinetype.get("group_id"), machinetype)

    return filtered_nodearray_definitions
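
# Illustrative sketch only (not part of the autoscale library): pbscc.parse_gb_size is the
# parser actually used above; this standalone approximation just shows the kind of
# normalization intended, i.e. turning a size expression such as "100g" or "8gb" into a
# plain number of gigabytes. The accepted suffixes and the returned unit here are
# assumptions, not the library's real behavior.
_EXAMPLE_SIZE_SUFFIXES = [("tb", 1024), ("gb", 1), ("t", 1024), ("g", 1), ("mb", 1.0 / 1024), ("m", 1.0 / 1024)]


def _example_parse_gb_size(value):
    value = str(value).strip().lower()
    for suffix, multiplier in _EXAMPLE_SIZE_SUFFIXES:
        if value.endswith(suffix):
            return float(value[:-len(suffix)]) * multiplier
    return float(value)  # raises ValueError when the value is not a size expression at all
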
def query_jobs(self):
    '''
    Converts PBS jobs into cyclecloud.job.Job instances. It will also compress jobs that have
    the exact same requirements.
    '''
    scheduler_config = self.driver.scheduler_config()
    scheduler_resources = [] + scheduler_config["resources"]
    # special case for hostname so we can automatically place jobs onto the appropriate host
    scheduler_resources.append("hostname")
    scheduler_resources.append("instance_id")

    group_jobs = not self.disable_grouping

    running_autoscale_jobs = []
    idle_autoscale_jobs = []

    # get the raw string outputs first, and convert them second. This somewhat limits the
    # race condition of asking the status of the queue twice.
    running_raw_jobs_str, running_converter = self.driver.running_jobs()
    queued_raw_jobs_str, queued_converter = self.driver.queued_jobs()

    running_raw_jobs = running_converter(running_raw_jobs_str)
    queued_raw_jobs = queued_converter(queued_raw_jobs_str)

    raw_jobs = []

    for raw_job in running_raw_jobs:
        # it is only running on a single node
        if '+' not in raw_job["exec_vnode"]:
            raw_jobs.append(raw_job)
            continue

        for vnode in raw_job["exec_vnode"].split("+"):
            sub_raw_job = deepcopy(raw_job)
            sub_raw_job["exec_vnode"] = vnode
            raw_jobs.append(sub_raw_job)

    for raw_job in queued_raw_jobs:
        if not raw_job["resource_list"].get("select"):
            raw_jobs.append(raw_job)
        else:
            # pbspro, like many schedulers, allows a varying set of requirements for the nodes in a
            # single submission. We will break it apart here as if each chunk had been submitted individually.
            place = raw_job["resource_list"].get("place")
            slot_type = raw_job["resource_list"].get("slot_type")

            chunks = pbscc.parse_select(raw_job)

            for n, chunk in enumerate(chunks):
                # only pay the penalty of copies when we actually have multi-chunk jobs
                sub_raw_job = deepcopy(raw_job)
                if len(chunks) > 1:
                    sub_raw_job["job_id"] = "%s.%d" % (sub_raw_job["job_id"], n)

                sub_raw_job["resource_list"] = {}
                if place:
                    sub_raw_job["resource_list"]["place"] = place
                if slot_type:
                    sub_raw_job["resource_list"]["slot_type"] = slot_type
                sub_raw_job["resource_list"]["select"] = pbscc.format_select(chunk)

                chunk["nodect"] = int(chunk["select"])

                if "ncpus" not in chunk:
                    chunk["ncpus"] = "1"

                for key, value in chunk.iteritems():
                    if key not in ["select", "nodect"]:
                        try:
                            value = pbscc.parse_gb_size(key, value) * chunk["nodect"]
                        except InvalidSizeExpressionError:
                            pass
                        sub_raw_job["resource_list"][key] = value

                sub_raw_job["nodect"] = sub_raw_job["resource_list"]["nodect"] = chunk["nodect"]

                raw_jobs.append(sub_raw_job)

    warnings = set()

    raw_jobs = [x for x in raw_jobs if x["job_state"].upper() in [pbscc.JOB_STATE_QUEUED, pbscc.JOB_STATE_RUNNING, pbscc.JOB_STATE_BATCH]]

    for raw_job in raw_jobs:
        pbs_job = mockpbs.mock_job(raw_job)

        nodect = int(pbs_job.Resource_List["nodect"])

        if pbs_job["job_state"].upper() == pbscc.JOB_STATE_RUNNING:
            # update running job
            live_resources = pbscc.parse_exec_vnode(raw_job["exec_vnode"])
            for key, value in live_resources.iteritems():
                # live resources are calculated on a per-node basis, but the Resource_List is based
                # on a total basis. We will normalize this below.
                if isinstance(value, numbers.Number):
                    pbs_job.Resource_List[key] = value * nodect
                else:
                    pbs_job.Resource_List[key] = value
            pbs_job["executing_hostname"] = live_resources["hostname"]

        is_array = bool(pbs_job.get("array", False))

        slots_per_job = int(pbs_job.Resource_List['ncpus']) / nodect

        slot_type = pbs_job.Resource_List["slot_type"]  # can be None, similar to {}.get("key"). It is a pbs class.
        pbscc.info("found slot_type %s." % slot_type)

        placement = pbscc.parse_place(pbs_job.Resource_List.get("place"))

        # Note: not sure we will ever support anything but group_id for autoscale purposes.
        # A user could pick, say, group=host, which implies an SMP job, not a parallel job.
        if placement.get("grouping", "group=group_id") != "group=group_id":
            placement.pop("grouping")

        if placement.get("arrangement", "").lower() in ["scatter", "vscatter"]:
            pack = "scatter"
        else:
            pack = "pack"

        exclusive = placement.get("sharing", "").lower() in ["excl", "exclhost"]
        # we may need to support sharing at some point, but it seems that we can ignore it for now.
        _shared = placement.get("sharing") in ["sharing"]
        placeby = placement.get("grouping")

        autoscale_job = Job(name=pbs_job["job_id"],
                            nodearray=slot_type,
                            nodes=nodect,
                            packing_strategy=pack,
                            exclusive=exclusive,
                            resources={"ncpus": 0},
                            executing_hostname=pbs_job.get("executing_hostname"))

        if placeby:
            autoscale_job.placeby = placeby.split("=", 1)[-1]

        if is_array:
            array_count = 0
            array_tasks = raw_job["array_state_count"]
            # Only grab the first two array task states (queued and running)
            for ajob in str(array_tasks).split(" ")[:2]:
                array_count += int(ajob.split(":")[1])
            # Multiply the number of cpus needed by the number of tasks in the array
            if array_count != 0:
                slots_per_job *= array_count
        else:
            array_count = 1

        # If it's an MPI job and grouping is enabled,
        # we want to use a grouped autoscale_job to get tightly coupled nodes
        if group_jobs and placement.get("grouping"):
            autoscale_job['grouped'] = True
            autoscale_job["nodes"] *= array_count
            autoscale_job.placeby_value = "single"
        elif is_array:
            autoscale_job["nodes"] *= array_count

        autoscale_job.ncpus += slots_per_job

        for attr, value in pbs_job.Resource_List.iteritems():
            if attr not in scheduler_resources:
                # if it isn't a scheduler-level attribute, don't bother considering it for
                # autoscale, as the scheduler won't respect it either.
                continue

            try:
                value = pbscc.parse_gb_size(attr, value)
                value = value / nodect
            except InvalidSizeExpressionError:
                if value.lower() in ["true", "false"]:
                    value = value.lower() == "true"

            autoscale_job.resources[attr] = value

        if raw_job["job_state"] == pbscc.JOB_STATE_QUEUED:
            idle_autoscale_jobs.append(autoscale_job)
        else:
            running_autoscale_jobs.append(autoscale_job)

    for warning in warnings:
        format_string, values = warning[0], warning[1:]
        pbscc.error(format_string % values)

    # leave an option for disabling this in case it causes issues.
    if self.cc_config.get("pbspro.compress_jobs", False):
        all_autoscale_jobs = running_autoscale_jobs + compress_queued_jobs(idle_autoscale_jobs)
    else:
        all_autoscale_jobs = running_autoscale_jobs + idle_autoscale_jobs

    return all_autoscale_jobs
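
# Minimal sketch, not the pbscc implementation: a PBS "place" specification such as
# "scatter:excl:group=group_id" is a colon-separated list made up of an arrangement
# (free/pack/scatter/vscatter), an optional sharing token (excl/shared/exclhost) and an
# optional grouping token (group=<resource>). The dict shape below mirrors how query_jobs
# consumes pbscc.parse_place ("arrangement", "sharing", "grouping"), but treat it as an
# approximation rather than the library's actual parser.
def _example_parse_place(place):
    result = {}
    for token in (place or "").split(":"):
        if token in ("free", "pack", "scatter", "vscatter"):
            result["arrangement"] = token
        elif token in ("excl", "shared", "exclhost"):
            result["sharing"] = token
        elif token.startswith("group="):
            result["grouping"] = token
    return result

# e.g. _example_parse_place("scatter:excl:group=group_id")
#      -> {"arrangement": "scatter", "sharing": "excl", "grouping": "group=group_id"}
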
def process_pbsnode(self, pbsnode, instance_ids_to_shutdown, nodearray_definitions):
    '''
    If the pbsnode is offline, will handle evaluating whether the node can be shut down. See
    instance_ids_to_shutdown, which is an OUT parameter here.

    Otherwise, convert the pbsnode into a cyclecloud.machine.Machine instance.
    '''
    states = set(pbsnode["state"].split(","))
    resources = pbsnode["resources_available"]
    # host has incorrect case
    hostname = resources["vnode"]

    instance_id = resources.get("instance_id", autoscale_util.uuid("instanceid"))

    def try_shutdown_pbsnode():
        if not instance_id:
            pbscc.error("instance_id was not defined for host %s, cannot shut it down" % hostname)
        elif "down" in states:
            # don't immediately remove down nodes, give them time to recover from network failure.
            remove_down_nodes = float(self.cc_config.get("pbspro.remove_down_nodes", 300))
            since_down = self.clock.time() - pbsnode["last_state_change_time"]
            if since_down > remove_down_nodes:
                pbscc.error("Removing down node %s after %.0f seconds", hostname, since_down)
                instance_ids_to_shutdown[instance_id] = hostname
                return True
            else:
                omega = remove_down_nodes - since_down
                pbscc.warn("Not removing down node %s for another %.0f seconds", hostname, omega)
        else:
            instance_ids_to_shutdown[instance_id] = hostname
            return True

        return False

    if "offline" in states:
        if not pbsnode.get("jobs", []):
            pbscc.fine("%s is offline and has no jobs, may be able to shut down" % hostname)
            if try_shutdown_pbsnode():
                return
        else:
            pbscc.fine("Host %s is offline but still running jobs" % hostname)

    # if the node is just in the down state, try to shut it down.
    if set(["down"]) == states and try_shutdown_pbsnode():
        return

    # just ignore complex down nodes (down,job-busy etc.) until PBS decides to change the state.
    if "down" in states:
        return

    # convert relevant resources from bytes to floating point (GB)
    for key in resources:
        value = resources[key]
        if isinstance(value, basestring) and value.lower() in ["true", "false"]:
            resources[key] = value.lower() == "true"
        elif isinstance(value, list):
            # TODO will need to support this eventually
            continue
        else:
            try:
                resources[key] = pbscc.parse_gb_size(key, resources[key])
            except InvalidSizeExpressionError:
                pass

    resources["hostname"] = hostname

    nodearray_name = resources.get("nodearray") or resources.get("slot_type")
    group_id = resources.get("group_id")

    if resources.get("machinetype") and nodearray_name:
        machinetype = nodearray_definitions.get_machinetype(nodearray_name, resources.get("machinetype"), group_id)
    else:
        # rely solely on resources_available
        pbscc.debug("machinetype is not defined for host %s, relying only on resources_available" % hostname)
        machinetype = {"availableCount": 1, "name": "undefined"}

    inst = machine.new_machine_instance(machinetype, **pbsnode["resources_available"])
    return inst
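
# Hypothetical usage sketch: process_pbsnode treats instance_ids_to_shutdown as an OUT
# parameter, so a caller accumulates shutdown candidates while collecting the machines that
# are still usable. The names below ("autoscaler", "pbsnodes") are illustrative and not part
# of this module.
def _example_collect_machines(autoscaler, pbsnodes, nodearray_definitions):
    instance_ids_to_shutdown = {}  # instance_id -> hostname, filled in by process_pbsnode
    machines = []
    for pbsnode in pbsnodes:
        inst = autoscaler.process_pbsnode(pbsnode, instance_ids_to_shutdown, nodearray_definitions)
        if inst is not None:
            machines.append(inst)
    return machines, instance_ids_to_shutdown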