def _get_jobs():
    """Translate the current SGE queue into a list of autoscale job dicts.

    Jobs whose state is exactly "h" or "e" are ignored, as are jobs that
    have no entry in the job details mapping.

    Each returned dict carries 'name', 'nodearray', 'request_cpus' and
    'average_runtime'; 'grouped' is added (as True) for multi-slot jobs
    when 'cyclecloud.cluster.autoscale.use_node_groups' is True.
    """
    details_by_number = get_sge_job_details()
    queued_jobs = get_sge_jobs()

    autoscale_jobs = []
    for queued in queued_jobs:
        # Held / errored jobs never drive autoscaling
        if queued["job_state"] in ("h", "e"):
            continue

        # The job may be gone from the qstat details by now
        # (guard added 20FEB2018 JLouie)
        job_number = queued["job_number"]
        if job_number not in details_by_number:
            print("PATCHED: Job %s is no longer in qstat details. Skipping..." % (
                job_number))
            continue

        detail = details_by_number[job_number]

        if 'hard_resources' in detail:
            nodearray = detail["hard_resources"].get("slot_type", None)
        else:
            nodearray = None

        if 'pe_range' in detail and 'min' in detail['pe_range']:
            request_cpus = int(detail['pe_range']['min'])
        else:
            request_cpus = 1

        if 'context' in detail and 'average_runtime' in detail['context']:
            runtime = int(detail['context']['average_runtime'])
        else:
            runtime = None

        autoscale_job = dict(
            name=job_number,
            nodearray=nodearray,
            request_cpus=request_cpus,
            average_runtime=runtime,
        )

        # MPI job with grouping enabled: ask for a grouped request so the
        # nodes come back tightly coupled
        if request_cpus > 1 and jetpack.config.get(
                'cyclecloud.cluster.autoscale.use_node_groups') is True:
            autoscale_job['grouped'] = True

        autoscale_jobs.append(autoscale_job)

    return autoscale_jobs
def sge_job_handler(job_details):
    """
    Work out which resource values should be set on one job.

    Takes the 'job_details' of a single job and returns a dict of updates:
    always a 'slot_type' (falling back to 'execute'), plus an
    'affinity_group' when one can be determined.

    TODO: What are handlers really allowed to update? Just hard resources
    "-l <foo>=<bar>"???
    """

    def _resource(job, name):
        """ Value of `name` from the job's hard resources, else soft, else None """
        for section in ('hard_resources', 'soft_resources'):
            if section in job and name in job[section]:
                return job[section][name]
        return None

    # Default the slot type when the job did not request one
    slot_type = _resource(job_details, 'slot_type')
    if slot_type is None:
        slot_type = 'execute'
    updates = {'slot_type': slot_type}

    groups_enabled = jetpack.config.get(
        'cyclecloud.cluster.autoscale.use_node_groups')
    affinity_group = _resource(job_details, 'affinity_group')

    # Only a parallel-environment job without an affinity group, with
    # grouping enabled, gets a group picked for it
    needs_group = (groups_enabled
                   and affinity_group is None
                   and 'pe_range' in job_details
                   and 'min' in job_details['pe_range'])
    if not needs_group:
        # We just use 'default' for the affinity group
        updates['affinity_group'] = affinity_group or 'default'
        return updates

    job_size = job_details['pe_range']['min']

    # First use: seed the (slot_type, affinity_group) -> job id map from
    # the jobs that already carry both values
    if not jobs_by_affinity_group:
        for other_id, other in get_sge_job_details().items():
            other_slot_type = _resource(other, 'slot_type')
            other_group = _resource(other, 'affinity_group')
            if other_slot_type and other_group:
                jobs_by_affinity_group[(other_slot_type, other_group)] = other_id

    host_complexes = get_host_complexes(
        ['slot_type', 'affinity_group', 'affinity_group_cores'])

    # Collect every candidate group before claiming one: same slot type,
    # a real (non-default) group, and a core count equal to the job size
    candidates = []
    for hc in host_complexes.values():
        if hc['slot_type'] != slot_type:
            continue
        if hc['affinity_group'] in [None, 'default']:
            continue
        if int(float(hc['affinity_group_cores'] or 0)) != int(job_size):
            continue
        candidates.append(hc['affinity_group'])

    # Claim the first candidate that no other job holds; if every
    # candidate is taken, no affinity group is returned
    for group in candidates:
        claim = (slot_type, group)
        if claim not in jobs_by_affinity_group:
            jobs_by_affinity_group[claim] = job_details['job_number']
            updates['affinity_group'] = group
            break

    return updates
return details try: sys.path.append("/opt/cycle/jetpack/config") from autoscale import sge_job_handler # This should always fail - "blah" should be a module name that we define for all autoscaling stuff except ImportError: pass # The default function above will be used instead if __name__ == "__main__": print "%s" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") pidfile = "/var/run/modify_jobs.pid" makePidfile(pidfile) job_details = get_sge_job_details() # Check for updates to the job resources = {} for job_detail in job_details.itervalues(): updates = sge_job_handler(job_detail) updates_to_apply = {} # Check the updates that were sent back to make sure they aren't already set, no need to reset them for k, v in updates.iteritems(): if k in job_detail['hard_resources'] and job_detail[ 'hard_resources'][k] == v: pass else: updates_to_apply[k] = v if updates_to_apply:
def _get_jobs():
    """Build one autoscale request dict per schedulable SGE job.

    Jobs whose state contains "h" or "e" are ignored, and so are jobs that
    have no entry in the job details mapping (a job can leave the queue
    between the two qstat snapshots taken at the top of this function).

    A job gets the plain request {'Name', 'TargetCoreCount'} unless its
    slot type is not "execute" and its context supplies an 'image_id'; in
    that case a dynamic nodearray request extending 'execute' is built, and
    the optional 'machine_type' / 'ppn' context values are applied to it.
    A context 'autostop' value sets
    Configuration.cyclecloud.cluster.autoscale.stop_enabled on the request.

    Returns:
        list of autoscale request dicts, in queue order.
    """
    job_details = get_sge_job_details()
    jobs = get_sge_jobs()

    metadata_url = "http://169.254.169.254/metadata/instance?api-version=2017-08-01"
    metadata_req = Request(metadata_url, headers={"Metadata" : True})
    metadata_response = urlopen(metadata_req)
    try:
        vm_metadata = json.load(metadata_response)
    finally:
        # Always release the HTTP connection, even if the body is not JSON
        metadata_response.close()

    # Neither value is read below; the lookups are kept so this function
    # still fails in the same way when the metadata or config is missing.
    subscription_id = vm_metadata["compute"]["subscriptionId"]
    node_config = jetpack.config.get()

    # Process all the jobs
    autoscale_requests = []
    for job in jobs:
        # Ignore jobs in "held" or "error" states
        if "h" in job["job_state"] or "e" in job["job_state"]:
            continue

        # Ignore jobs that are no longer in the qstat details instead of
        # failing the whole run with a KeyError
        job_number = job["job_number"]
        if job_number not in job_details:
            print("WARNING: Job %s is no longer in qstat details. Skipping..."
                  % job_number)
            continue

        detail = job_details[job_number]
        context = detail['context'] if 'context' in detail else {}

        slot_type = None
        if 'hard_resources' in detail:
            slot_type = detail["hard_resources"].get("slot_type", None)

        slots_per_job = 1
        if 'pe_range' in detail and 'min' in detail['pe_range']:
            slots_per_job = int(detail['pe_range']['min'])

        autoscale_request = {
            'Name': slot_type,
            'TargetCoreCount': slots_per_job
        }

        if slot_type != "execute":
            image_resource_id = context.get('image_id')

            if image_resource_id is None:
                print("WARNING: Job %s has non-execute slottype, but application image name not provided. Autoscaling %s nodes."
                      % (job["job_number"], slot_type))
                print("WARNING: Job details %s" % detail)
            else:
                autoscale_request = {
                    'Name': slot_type,
                    'Extends': 'execute',
                    'ImageId': image_resource_id,
                    'TargetCoreCount': slots_per_job,
                    'Configuration': {
                        'gridengine': {
                            'slot_type' : slot_type
                        }
                    },
                    'Dynamic': True
                }

                # NOTE(review): machine_type / ppn are applied only to the
                # dynamic nodearray request, the one place where
                # Configuration.gridengine is guaranteed to exist - confirm
                # this matches the intended nesting.
                if 'machine_type' in context:
                    autoscale_request["MachineType"] = context['machine_type']

                if 'ppn' in context:
                    ppn = context['ppn']
                    autoscale_request['Configuration']['gridengine']['slots'] = ppn
                    autoscale_request['CoreCount'] = ppn

        if 'autostop' in context:
            # cyclecloud.cluster.autoscale.stop_enabled
            # Anything other than a case-insensitive 'false' leaves it on
            stop_enabled = context['autostop'].lower() != 'false'
            configuration = autoscale_request.setdefault("Configuration", {})
            configuration["cyclecloud"] = {
                'cluster': {
                    'autoscale': {
                        'stop_enabled': stop_enabled
                    }
                }
            }

        print("Autoscale req: %s" % autoscale_request)
        autoscale_requests.append(autoscale_request)

    return autoscale_requests