def test_hostlist_create_new():
    """Hostlist: Create new hostlist"""
    hl = pyslurm.hostlist()
    hosts = "c1, c[2-3]"
    # create() expands the bracketed range into three hosts
    assert hl.create(hosts)
    assert hl.count() == 3
    assert hl.find("c3") == 2
    # push() accepts ranged expressions and returns the number of hosts added
    assert hl.push("c[4-5]") == 2
    assert hl.count() == 5
    assert hl.push_host("c6") == 1
    assert hl.ranged_string() == "c[1-6]"
    # shift() removes and returns the first host
    assert hl.shift() == "c1"
    assert hl.count() == 5
    assert hl.push_host("c6") == 1
    assert hl.ranged_string() == "c[2-6,6]"
    # uniq() collapses the duplicated c6
    hl.uniq()
    assert hl.ranged_string() == "c[2-6]"
    hl.destroy()
    # a destroyed hostlist reports a count of -1
    assert hl.count() == -1
def test_hostlist_create_new():
    """Hostlist: Create new hostlist"""
    initial_hosts = "c1, c[2-3]"
    hosts_list = pyslurm.hostlist()
    # creation succeeds and the bracketed range expands to three hosts
    assert_true(hosts_list.create(initial_hosts))
    assert_equals(hosts_list.count(), 3)
    assert_equals(hosts_list.find("c3"), 2)
    # pushing a ranged expression reports how many hosts were added
    assert_equals(hosts_list.push("c[4-5]"), 2)
    assert_equals(hosts_list.count(), 5)
    assert_equals(hosts_list.push_host("c6"), 1)
    assert_equals(hosts_list.ranged_string(), "c[1-6]")
    # shift pops the first host off the front of the list
    assert_equals(hosts_list.shift(), "c1")
    assert_equals(hosts_list.count(), 5)
    assert_equals(hosts_list.push_host("c6"), 1)
    assert_equals(hosts_list.ranged_string(), "c[2-6,6]")
    # uniq collapses the duplicate c6 entry
    hosts_list.uniq()
    assert_equals(hosts_list.ranged_string(), "c[2-6]")
    # once destroyed, the list reports a count of -1
    hosts_list.destroy()
    assert_equals(hosts_list.count(), -1)
def test_hostlist_create_new():
    """Hostlist: Create new hostlist"""
    hosts = "c1, c[2-3]"
    hl = pyslurm.hostlist()
    # the bracketed range expands to three hosts
    created = hl.create(hosts)
    assert created
    assert 3 == hl.count()
    assert 2 == hl.find("c3")
    # a ranged push reports how many hosts were added
    assert 2 == hl.push("c[4-5]")
    assert 5 == hl.count()
    assert 1 == hl.push_host("c6")
    assert "c[1-6]" == hl.ranged_string()
    # shifting removes and returns the head of the list
    first = hl.shift()
    assert first == "c1"
    assert 5 == hl.count()
    assert 1 == hl.push_host("c6")
    assert "c[2-6,6]" == hl.ranged_string()
    # de-duplicate the second c6
    hl.uniq()
    assert "c[2-6]" == hl.ranged_string()
    # a destroyed hostlist reports -1
    hl.destroy()
    assert -1 == hl.count()
def test_hostlist_create_empty():
    """Hostlist: Test create empty hostlist."""
    hl = pyslurm.hostlist()
    # create() with no argument builds an empty but valid hostlist
    hl.create()
    assert hl.count() == 0
    hl.destroy()
    # a destroyed hostlist reports a count of -1
    assert hl.count() == -1
def test_hostlist_create_empty():
    """Hostlist: Test create empty hostlist."""
    hl = pyslurm.hostlist()
    # no argument -> an empty (but still valid) hostlist
    hl.create()
    count_after_create = hl.count()
    assert count_after_create == 0
    # destruction makes count() report -1
    hl.destroy()
    assert -1 == hl.count()
def __init__(self):
    """Gather node information and initialise the selection state."""
    # hosts that are not part of the farm proper
    self.noFarm = ['slurmweb', 'huematrix']
    # hostname -> info mapping, populated by SearchNodes() below
    self.nodeInformations = {}
    self.__hosts = hostlist()
    self.__node = node()
    self.SearchNodes()
    # sorted() over a dict iterates its keys, equivalent to sorted(keys())
    self.nodes = sorted(self.nodeInformations)
    # selection filters start out empty
    self.state = ''
    self.node = ''
def test_hostlist_create_empty():
    """Hostlist: Test create empty hostlist."""
    empty_hl = pyslurm.hostlist()
    # calling create() without hosts yields an empty (but valid) list
    empty_hl.create()
    assert_equals(empty_hl.count(), 0)
    # once destroyed, count() reports -1
    empty_hl.destroy()
    assert_equals(empty_hl.count(), -1)
#!/usr/bin/env python
"""
Retrieve Slurm hosts
"""
from __future__ import print_function
import pyslurm


def _show_position(hl, host):
    # Report where *host* sits in *hl*, or complain if it is missing.
    idx = hl.find(host)
    if idx == -1:
        print("Failed to find {0} in list".format(host))
    else:
        print("\tHost {0} found at position {1}".format(host, idx))


hl = pyslurm.hostlist()
hosts = "dummy0,dummy1,dummy1,dummy3,dummy4"
print("Creating hostlist ...... with {0}".format(hosts))
if hl.create(hosts):
    print()
    print("\tHost list count is {0}".format(hl.count()))
    # locate a known host before and after de-duplication
    _show_position(hl, "dummy3")
    print("\tCalling uniq on current host list")
    hl.uniq()
    print("\tNew host list is {0}".format(hl.get()))
    print("\tNew host list count is {0}".format(hl.count()))
    _show_position(hl, "dummy3")
#!/usr/bin/env python
from __future__ import print_function
import pyslurm

# build a hostlist that contains a deliberate duplicate (dummy1)
host_string = "dummy0,dummy1,dummy1,dummy3,dummy4"
hlist = pyslurm.hostlist()
print("Creating hostlist ...... with {0}".format(host_string))
if hlist.create(host_string):
    print()
    print("\tHost list count is {0}".format(hlist.count()))
    target = "dummy3"
    # find() returns -1 when the host is absent
    position = hlist.find(target)
    if position != -1:
        print("\tHost {0} found at position {1}".format(target, position))
    else:
        print("Failed to find {0} in list".format(target))
    # de-duplicate, then locate the same host again
    print("\tCalling uniq on current host list")
    hlist.uniq()
    print("\tNew host list is {0}".format(hlist.get()))
    print("\tNew host list count is {0}".format(hlist.count()))
    position = hlist.find(target)
    if position != -1:
        print("\tHost {0} found at position {1}".format(target, position))
    else:
        print("Failed to find {0} in list".format(target))
def get_core_usage(data):
    """Aggregate per-partition core/node/gpu usage and bins of free cores.

    First walks ``data["jobs"]`` to split each RUNNING job's per-node core
    layout into cpu-job vs gpu-job cores, and to tag nodes with the PENDING
    jobs scheduled onto them.  Then sweeps ``data["nodes"]`` once per
    partition to build usage counters and schedulable free-core bins.

    :param data: dict with "jobs", "nodes" and "res" sub-dicts -- schema is
        assumed from usage here, confirm against the producer of ``data``.
    :returns: tuple ``(u, bcu)`` where ``u[partition]`` maps
        ``"cores"|"nodes"|"gpus"`` to ``{"a","i","o","b","t"}`` counters
        (alloc, idle, offline, blocked, total) and ``bcu[partition]`` is a
        ``bins()`` object of free-core chunks.

    NOTE(review): reflowed from a whitespace-collapsed source; the block
    nesting below is a careful reconstruction -- verify against history.
    """
    # split gpu cores from cpu cores
    for id, job in data["jobs"].items():
        if job["state"] == "RUNNING":
            for n, l in job["layout"].items():
                # data['nodes'][n]['gpuJobsOnNode'].append(id)
                # data['nodes'][n]['gpuJobsCoresUsed'] += job['nCpus']
                # jobs holding gpus are counted against the gpu layout,
                # everything else against the cpu layout
                k = "cpuLayout"
                r = "cpuJobCnt"
                if job["nGpus"] > 0:
                    k = "gpuLayout"
                    r = "gpuJobCnt"
                data["nodes"][n][k].extend(l)
                data["nodes"][n][r] += 1
        elif job["state"] == "PENDING":
            # store future jobs with the nodes they're going to run on
            if job["startTime"] != 0 and job["schedNodes"] is not None:
                b = pyslurm.hostlist()
                b.create(job["schedNodes"])
                for n in b.get_list():
                    # get_list() yields bytes; decode to a hostname string
                    n = str(n, "utf-8")
                    # print(n,id)
                    data["nodes"][n]["futureJobs"].append(id)
    # derive per-node used-core counts from the layouts built above
    for hostname, node in data["nodes"].items():
        node["nCpuCores"] = len(node["cpuLayout"])
        node["nGpuCores"] = len(node["gpuLayout"])
        if debug:
            if hostname in [
                "john6",
                "john99",
                "gstar102",
                "sstar011",
                "sstar107",
                "sstar301",
                "gstar201",
                "bryan1",
                "gstar040",
                "gina3",
                "john32",
                "john34",
            ]:
                print(hostname, node)
    # bins of cores free
    bcu = {}
    u = {}
    for part, prefixes in PARTITIONS.items():
        # running/alloc, idle/avail, offline, blocked/unschedulable/inaccessible, total - for cores and nodes and gpus
        usage = {
            "cores": {"a": 0, "i": 0, "o": 0, "b": 0, "t": 0},
            "nodes": {"a": 0, "i": 0, "o": 0, "b": 0, "t": 0},
            "gpus": {"a": 0, "i": 0, "o": 0, "b": 0, "t": 0},
        }
        bc = bins()
        for hostname, node in data["nodes"].items():
            # NOTE(review): no break -- a node matching several prefixes of
            # one partition would be counted more than once; confirm prefixes
            # are mutually exclusive.
            for prefix in prefixes:
                if prefix in hostname:
                    # gpu jobs get 4 cores (in skylake-gpu), but can use more
                    gpuCoresMax = max(partitionResGpuCores[part], node["nGpuCores"])
                    # cpu jobs get the rest. ie. max 32 on skylake
                    cpuCoresMax = node["nCpus"] - gpuCoresMax
                    nCpuIdleCores = cpuCoresMax - node["nCpuCores"]
                    nGpuIdleCores = gpuCoresMax - node["nGpuCores"]
                    # which type of cores we are considering
                    ty = "nCpuCores"
                    if part == "skylake-gpu":
                        ty = "nGpuCores"
                    # unavail nodes do not contribute to free counts
                    if node["avail"]:
                        # can't schedule jobs on these cores for a variety of reasons
                        unsched = 0
                        # find idle and idle per-socket layouts
                        if part == "skylake":
                            # ideally gpus would have 2 cores reserved on each socket but that's not actually
                            # the case - they get >= 4 cores overall.
                            # assume 0-17 on socket0, 18-35 on socket1
                            s0 = 0
                            s1 = 0
                            g0 = 0
                            g1 = 0
                            for i in node["cpuLayout"]:
                                if i < 18:
                                    s0 += 1
                                else:
                                    s1 += 1
                            for i in node["gpuLayout"]:
                                if i < 18:
                                    g0 += 1
                                else:
                                    g1 += 1
                            if debug:
                                if s0 + s1 != node["nCpuCores"]:
                                    print(
                                        hostname,
                                        "err: cpu: ",
                                        s0,
                                        "+",
                                        s1,
                                        "!=",
                                        node["nCpuCores"],
                                        "node",
                                        node,
                                    )
                                if g0 + g1 != node["nGpuCores"]:
                                    print(
                                        hostname,
                                        "err: gpu: ",
                                        g0,
                                        "+",
                                        g1,
                                        "!=",
                                        node["nGpuCores"],
                                        "node",
                                        node,
                                    )
                            # g0+g1 = 0 - 4 cores held idle (no gpu jobs running)
                            # g0+g1 >= 4 - 0 cores held idle
                            # but we could be a weird situation where previous gpu jobs (
                            # eg. 4 cores, 1 gpu) have skewed the cpus used
                            # on each socket, so even though if there are cores avail we don't
                            # know where they'll be.
                            cpuCoresIdle0 = 18 - (s0 + g0)
                            cpuCoresIdle1 = 18 - (s1 + g1)
                            # also need to take account of inaccessible.
                            # eg. all gpus used, but not all 4 gpu cores used. so some cores unschedulable
                            gpuRes = g0 + g1
                            if (node["nGpusUsed"] == node["nGpus"]):  # both gpus being used
                                if gpuRes < 4:
                                    unsched += 4 - gpuRes
                                    # just round-robin subtract
                                    for i in range(0, 4 - gpuRes):
                                        if i % 2:
                                            if cpuCoresIdle0 == 0:
                                                cpuCoresIdle1 -= 1
                                            else:
                                                cpuCoresIdle0 -= 1
                                        else:
                                            if cpuCoresIdle1 == 0:
                                                cpuCoresIdle0 -= 1
                                            else:
                                                cpuCoresIdle1 -= 1
                            if debug:
                                if cpuCoresIdle0 < 0 or cpuCoresIdle1 < 0:
                                    print(
                                        "nope, you stuffed up idle",
                                        cpuCoresIdle0,
                                        cpuCoresIdle1,
                                        "node",
                                        node,
                                    )
                                if cpuCoresIdle0 + cpuCoresIdle1 != nCpuIdleCores:
                                    print(
                                        hostname,
                                        "err",
                                        cpuCoresIdle0,
                                        "+",
                                        cpuCoresIdle1,
                                        "+",
                                        unsched,
                                        "!=",
                                        nCpuIdleCores,
                                        "node",
                                        node,
                                    )
                            # also need to take account of inaccessible.
                            # 1-4 cores slots for cpu jobs on non-smalljobs, non-largemem nodes will never be used
                            if (cpuCoresIdle0 + cpuCoresIdle1 <= 16):  # small jobs could run and would have socket affinity set
                                if not ("smalljobs" in node["features"] or "largemem" in node["features"]):
                                    if cpuCoresIdle0 != 0 and cpuCoresIdle0 <= 4:
                                        unsched += cpuCoresIdle0
                                        cpuCoresIdle0 = 0
                                        if debug:
                                            print(
                                                "not smalljobs",
                                                "cpuCoresIdle0",
                                                cpuCoresIdle0,
                                                "cpuCoresIdle1",
                                                cpuCoresIdle1,
                                                "unsched",
                                                unsched,
                                                hostname,
                                                node,
                                            )
                                    if cpuCoresIdle1 != 0 and cpuCoresIdle1 <= 4:
                                        unsched += cpuCoresIdle1
                                        cpuCoresIdle1 = 0
                                        if debug:
                                            print(
                                                "not smalljobs",
                                                "cpuCoresIdle1",
                                                cpuCoresIdle1,
                                                "cpuCoresIdle0",
                                                cpuCoresIdle0,
                                                "unsched",
                                                unsched,
                                                hostname,
                                                node,
                                            )
                            if debug:
                                if (cpuCoresIdle0 + cpuCoresIdle1 + unsched < nCpuIdleCores):
                                    print(
                                        hostname,
                                        "err2",
                                        cpuCoresIdle0,
                                        "+",
                                        cpuCoresIdle1,
                                        "+",
                                        unsched,
                                        "<",
                                        nCpuIdleCores,
                                        "node",
                                        node,
                                    )
                            if debug:
                                if part == "skylake" and unsched > 0:
                                    print("blocked core", unsched, hostname, node)
                            if debug:
                                if node["nCpus"] - node["nCpusUsed"] - unsched < 0:
                                    print("err3", unsched, hostname, node)
                            if (debug and node["nCpus"] - node["nCpusUsed"] - (cpuCoresIdle0 + cpuCoresIdle1 + unsched) != 0):
                                print(
                                    "unsched",
                                    "free - (c0+c1+un)",
                                    node["nCpus"] - node["nCpusUsed"] - (cpuCoresIdle0 + cpuCoresIdle1 + unsched),
                                    "cpuCoresIdle0",
                                    cpuCoresIdle0,
                                    "cpuCoresIdle1",
                                    cpuCoresIdle1,
                                    "unsched",
                                    unsched,
                                    hostname,
                                    node,
                                )
                            # affinity kicks in here:
                            # 16,32 core jobs need whole (16 core) sockets
                            # <16 core jobs need to be on 1 socket
                            # eg.
                            # 2,4 free cores means max 1x4 and 1x2 core jobs can run, but not a 6
                            # 2,14 free cores means max 1x14 and 1x2 core jobs can run, but not a 16
                            # 8,8 free cores means max 2x8 core jobs can run, but not a 16
                            # 10,6 free cores means 1x10 and 1x6 can run but not a 16
                            # 10,8 free cores means 1x18 job (no affinity set)
                            # 16,0 free cores means 1x16 job
                            # 17,1 free cores means 1x18 job (no affinity set)
                            # 18,0 free cores means 1x18 job (no affinity set)
                            # 16,8 free cores means 1x24 job (no affinity set)
                            # 15,15 free cores means 1x30 job (no affinity set)
                            # it's never eg. 18,0 free 'cos of gpu reserved cores
                            #
                            # so:
                            # if >16 total free then that's the job size
                            # else it's the per-socket numbers
                            c = cpuCoresIdle0 + cpuCoresIdle1
                            if c > 0:
                                m = (node["realMem"] - node["allocMem"]) / c
                                d = (node["realDisk"] - node["allocDisk"]) / c
                                # cores/nodes can be unavail because of being out of ram or disk too
                                if d <= 100 or m <= 100:  # 100 MB per core
                                    unsched += cpuCoresIdle0 + cpuCoresIdle1
                                    cpuCoresIdle0 = 0
                                    cpuCoresIdle1 = 0
                                    if debug:
                                        print("mem/disk blocked:", hostname, m, d)
                                else:
                                    mint = minFutureJobTime(node, data["jobs"], hostname, data["res"])
                                    if c > 16:
                                        bc.add(c, m, d, mint, hostname)
                                    else:
                                        if cpuCoresIdle0 > 0:
                                            bc.add(cpuCoresIdle0, m, d, mint, hostname)
                                        if cpuCoresIdle1 > 0:
                                            bc.add(cpuCoresIdle1, m, d, mint, hostname)
                        else:  # not skylake
                            if part != "skylake-gpu":  # sstar, gstar, knl avail
                                if nCpuIdleCores > 0:
                                    m = (node["realMem"] - node["allocMem"]) / nCpuIdleCores
                                    d = (node["realDisk"] - node["allocDisk"]) / nCpuIdleCores
                                    # cores/nodes can be unavail because of being out of ram or disk too
                                    if d <= 100 or m <= 100:  # 100 MB per core
                                        unsched += nCpuIdleCores
                                        nCpuIdleCores = 0
                                        if debug:
                                            print("mem/disk blocked:", hostname, m, d)
                                    else:
                                        mint = minFutureJobTime(node, data["jobs"], hostname, data["res"])
                                        bc.add(nCpuIdleCores, m, d, mint, hostname)
                        # cores
                        if part == "skylake-gpu":
                            c = nGpuIdleCores - unsched
                            if c > 0:
                                m = (node["realMem"] - node["allocMem"]) / c
                                d = (node["realDisk"] - node["allocDisk"]) / c
                                # cores/nodes can be unavail because of being out of ram or disk too
                                if d <= 100 or m <= 100:  # 100 MB per core
                                    unsched = nGpuIdleCores
                            usage["cores"]["i"] += nGpuIdleCores - unsched
                            usage["cores"]["t"] += gpuCoresMax
                            usage["cores"]["b"] += unsched
                        elif part == "skylake":
                            usage["cores"]["i"] += cpuCoresIdle0 + cpuCoresIdle1
                            usage["cores"]["t"] += cpuCoresMax
                            usage["cores"]["b"] += unsched
                        else:
                            usage["cores"]["i"] += nCpuIdleCores
                            usage["cores"]["t"] += cpuCoresMax
                            usage["cores"]["b"] += unsched
                        usage["cores"]["a"] += node[ty]
                        # nodes
                        if node[ty] == 0:
                            usage["nodes"]["i"] += 1
                        else:
                            usage["nodes"]["a"] += 1
                        usage["nodes"]["t"] += 1
                        # gpus
                        if part != "skylake":
                            # if there are any cores free then gpu jobs can run
                            if node["nCpus"] - node["nCpusUsed"] > 0:
                                usage["gpus"]["i"] += node["nGpus"] - node["nGpusUsed"]
                            else:
                                if debug:
                                    if (part == "skylake-gpu" and node["nGpus"] - node["nGpusUsed"] > 0):
                                        print(
                                            "blocked gpu",
                                            node["nGpus"] - node["nGpusUsed"],
                                            node,
                                        )
                                usage["gpus"]["b"] += (node["nGpus"] - node["nGpusUsed"])  # blocked/unavail
                            # if part == 'skylake-gpu' and node['nGpus'] - node['nGpusUsed'] != 0:
                            #     print(hostname, node['nGpus'], node['nGpusUsed'], node['nGpus'] - node['nGpusUsed'])
                            usage["gpus"]["a"] += node["nGpusUsed"]
                            usage["gpus"]["t"] += node["nGpus"]
                    else:
                        # non-avail nodes - drained, draining, down, reserved (& unused)
                        # cores
                        if part == "skylake-gpu":
                            usage["cores"]["o"] += nGpuIdleCores
                        else:
                            usage["cores"]["o"] += nCpuIdleCores
                        usage["cores"]["a"] += node[ty]
                        usage["cores"]["t"] += node[ty]
                        # nodes
                        if node["nCpusUsed"] > 0:
                            usage["nodes"]["a"] += 1
                        usage["nodes"]["o"] += 1
                        usage["nodes"]["t"] += 1
                        # gpus
                        if part != "skylake":
                            usage["gpus"]["o"] += node["nGpus"] - node["nGpusUsed"]
                            usage["gpus"]["a"] += node["nGpusUsed"]
                            # NOTE(review): "t" is incremented by nGpusUsed here but
                            # by nGpus on the avail path -- possibly a typo, verify.
                            usage["gpus"]["t"] += node["nGpusUsed"]
                    if debug:
                        if (node["nCpus"] == 0 and node["state"] != "IDLE" and node["avail"]):
                            print(
                                "state not idle or unavail, yet 0 cores used",
                                hostname,
                                node,
                            )
                        if (node["nGpusUsed"] > 0 and node["nGpuCores"] == 0 or node["nGpusUsed"] == 0 and node["nGpuCores"] > 0):
                            print("gpu count mismatch", hostname, node)
                        # pretty often be transiently wrong
                        if node["nCpusUsed"] != (node["nGpuCores"] + node["nCpuCores"]):
                            print("cpu count mismatch", hostname, node)
        # if debug:
        #     print(part, usage)
        #     for a, b in usage.items():
        #         s = 0
        #         for i, j in b.items():
        #             if i == 't' or i == 'o':
        #                 continue
        #             s += j
        #         if s != b['t'] - b['o']:
        #             print('usage total wrong', 'sum', s, part, a, b)
        u[part] = usage
        bcu[part] = bc
        if debug:
            print(bc.data, "sum", bc.sum())
            # print(bc.data, 'sum', bc.sum())
    return u, bcu