import datetime
import operator
import time
from collections import defaultdict
from functools import reduce    # builtin on Python 2; the explicit import keeps Python 3 happy

# PyslurmQuery, MyTool, and BrightRestClient are project-local modules assumed
# to be importable alongside this file.


def display_node_GPU(node_name):
    ts   = int(time.time())
    node = PyslurmQuery.getNode(node_name)
    if not node:
        print("{} Node {} does not exist.".format(MyTool.getTsString(ts), node_name))
        return
    if 'gpu' not in node['features']:
        print("{} Node {} does not have GPUs.".format(MyTool.getTsString(ts), node_name))
        return

    jobs                = PyslurmQuery.getNodeAllocJobs(node_name, node)
    gpu_total, gpu_used = MyTool.getGPUCount(node['gres'], node['gres_used'])
    print("{}: Node {} up {},\t{} GPUs ({} used), {} allocated jobs.".format(
          MyTool.getTsString(ts), node_name,
          datetime.timedelta(seconds=ts - node['boot_time']),
          gpu_total, gpu_used, len(jobs) if jobs else 0))

    # map each job to the GPU indices it is allocated on this node
    jid2gpu = dict(map(lambda job: (job['job_id'], PyslurmQuery.getJobAllocGPUonNode(job, node)), jobs))
    if jid2gpu:
        job_gpu  = reduce(operator.add, jid2gpu.values())
        start_ts = min([job['start_time'] for job in jobs if job['gres_detail']])
        gpu_data = BrightRestClient().getNodeGPU(node_name, start_ts, job_gpu, msec=False)
    else:
        gpu_data = {}

    jid2job = dict(map(lambda job: (job['job_id'], job), jobs))
    gid2jid = defaultdict(list)     # reverse map: GPU index -> job ids using it
    for jid, gpu_list in jid2gpu.items():
        for gid in gpu_list:
            gid2jid[gid].append(jid)

    print("\t{:6}{:10}{:>20}{:>20}{:>25}".format("GPU", "Jid", "Job run time", "Job avg util", "Avg util (5,10,30min)"))
    for gid in range(0, gpu_total):
        jid_list = gid2jid[gid]
        if jid_list:
            start_ts = min([jid2job[jid]['start_time'] for jid in jid_list])
            g_data   = gpu_data['{}.gpu{}'.format(node_name, gid)]
            g_avg    = MyTool.getTimeSeqAvg(g_data, start_ts,     ts)
            g_avg1   = MyTool.getTimeSeqAvg(g_data, ts -  5 * 60, ts)
            g_avg2   = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
            g_avg3   = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
            print("\t{:<6}{:10}{:>20}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".format(
                  gid, str(jid_list), str(datetime.timedelta(seconds=ts - start_ts)),
                  g_avg * 100, g_avg1 * 100, g_avg2 * 100, g_avg3 * 100))
        else:
            print("\t{:<6}{:10}".format(gid, "IDLE"))
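# MyTool.getGPUCount is defined elsewhere in the project. The sketch below is a
# minimal, hypothetical illustration of how such a counter could parse Slurm
# gres strings like "gpu:v100:4" (total) and "gpu:v100:2(IDX:0,2)" (used). The
# gres format varies by Slurm version and the helper name _count_gpus_sketch is
# ours, so treat this as an assumption, not the project's actual implementation.
import re

def _count_gpus_sketch(gres, gres_used):
    """Return (total, used) GPU counts parsed from Slurm gres strings."""
    def count(spec):
        total = 0
        for item in spec if isinstance(spec, list) else [spec]:
            m = re.match(r'gpu(?::[\w\-]+)?:(\d+)', item or '')
            if m:
                total += int(m.group(1))
        return total
    return count(gres), count(gres_used)

# e.g. _count_gpus_sketch("gpu:v100:4", "gpu:v100:2(IDX:0,2)") -> (4, 2)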
def display_job_GPU(jid):
    ts  = int(time.time())
    job = PyslurmQuery.getCurrJob(jid)
    if not job:
        print("{} Job {} does not exist or has already stopped running.".format(MyTool.getTsString(ts), jid))
        return
    j_gpu = PyslurmQuery.getJobAllocGPU(job)   # node -> list of allocated GPU indices
    if not j_gpu:
        print("{} Job {} has no allocated GPUs.".format(MyTool.getTsString(ts), jid))
        return

    print("{} Job {} of {} has run for {},\talloc {} GPUs on {} GPU nodes.".format(
          MyTool.getTsString(ts), jid, MyTool.getUser(job['user_id']),
          datetime.timedelta(seconds=ts - job['start_time']),
          sum([len(g_lst) for g_lst in j_gpu.values()]),
          sum([1 for g_lst in j_gpu.values() if g_lst])))

    # union of GPU indices across all nodes, then one query for the whole job
    gpu_union = reduce(lambda rlt, curr: rlt.union(set(curr)), j_gpu.values(), set())
    gpu_data  = BrightRestClient().getGPU(list(j_gpu.keys()), job['start_time'], list(gpu_union), msec=False)

    print("\t{:12}{:>6}{:>20}{:>25}".format("Node", "GPU", "Job avg util", "Avg util (5,10,30min)"))
    for node_name, gpu_list in j_gpu.items():
        for gid in gpu_list:
            g_data = gpu_data['{}.gpu{}'.format(node_name, gid)]
            g_avg  = MyTool.getTimeSeqAvg(g_data, job['start_time'], ts)
            g_avg1 = MyTool.getTimeSeqAvg(g_data, ts -  5 * 60, ts)
            g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
            g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
            print("\t{:12}{:6}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".format(
                  node_name, gid, g_avg * 100, g_avg1 * 100, g_avg2 * 100, g_avg3 * 100))
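# MyTool.getTimeSeqAvg is defined elsewhere in the project; every display
# helper here uses it to average a utilization series over a time window. The
# sketch below shows the simplest reading of that contract, assuming the Bright
# data is a sequence of (timestamp, utilization) pairs; the real helper may
# weight samples by the interval they cover, so this is an illustration only.
def _time_seq_avg_sketch(seq, start_ts, end_ts):
    """Mean of the values of (ts, value) samples falling inside [start_ts, end_ts]."""
    vals = [v for t, v in seq if start_ts <= t <= end_ts]
    return sum(vals) / float(len(vals)) if vals else 0.0

# e.g. _time_seq_avg_sketch([(0, 0.2), (60, 0.4), (120, 0.9)], 0, 60) -> 0.3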
def sendUpdate(self, ts, slurmJobs, hn2data, slurmNodes):
    # pickle and compress the snapshot once, then POST it to every target URL
    # (zlib, cPickle, urllib2, threading come from the Python 2 stdlib)
    zps = zlib.compress(cPickle.dumps((ts, slurmJobs, hn2data, slurmNodes), -1))
    for url in self.urls:
        try:
            logger.debug("sendUpdate to {}".format(url))
            if not self.test_mode:
                resp = urllib2.urlopen(urllib2.Request(url, zps, {'Content-Type': 'application/octet-stream'}))
            else:
                resp = 0    # test mode: skip the network call
            logger.debug("{}:{}: sendUpdate to {} with return code {}".format(
                         threading.currentThread().ident, MyTool.getTsString(ts), url, resp))
        except Exception as e:
            logger.error('Failed to update slurm data {}: {}'.format(url, e))
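# sendUpdate's payload is a zlib-compressed pickle, so a receiving endpoint has
# to apply the two inverse steps. Below is a minimal sketch of that decoding,
# assuming the POST body arrives as raw bytes; the function name is ours, and
# only the compress-then-pickle pairing is taken from sendUpdate above.
import zlib
import cPickle          # Python 2; use pickle on Python 3

def _decode_update_sketch(body):
    """Inverse of sendUpdate's encoding: decompress, then unpickle the 4-tuple."""
    ts, slurmJobs, hn2data, slurmNodes = cPickle.loads(zlib.decompress(body))
    return ts, slurmJobs, hn2data, slurmNodes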
def dealData(self, ts, slurmJobs, slurmNodes, msgs):
    # derived helper structure: node -> uid -> allocated CPU count
    node2uid2cpuCnt = self.getUserAllocCPUOnNode(slurmJobs)
    nodeUserProcs   = {}   # node -> [state, delta, ts, [user, procs] ...]

    # Update the information using the incoming msgs. A record is generated for
    # every host to reflect the current SLURM status, even hosts without a msg.
    for hostname, slurmNode in slurmNodes.items():
        pre_ts, pre_procs, pre_nodeUserProcs = self.savNode2TsProcs[hostname]
        if (hostname not in msgs) or (pre_ts >= msgs[hostname][-1]['hdr']['msg_ts']):
            # no newer data for this host; fall back to the saved record
            logger.debug("No new data of {}. Use previous data at {}".format(hostname, MyTool.getTsString(pre_ts)))
            if hostname in msgs:
                stale_msgs = msgs.pop(hostname)
                logger.debug("\tIgnore the incoming older data at {}.".format(
                             MyTool.getTsString(stale_msgs[-1]['hdr']['msg_ts'])))
            if pre_ts != -1:
                nodeUserProcs[hostname] = pre_nodeUserProcs
            else:
                nodeUserProcs[hostname] = [slurmNode.get('state', '?STATE?'), 0.0, ts]
        else:   # hostname in msgs with newer data
            host_msgs = msgs.pop(hostname)
            msg       = host_msgs[-1]    # the latest message
            msg_ts    = msg['hdr']['msg_ts']
            msg_procs = dict([(proc['pid'], proc) for proc in msg['processes']])
            if len(host_msgs) > 1:
                # prefer the second-latest message as the baseline if it is
                # newer than the saved one
                pre_msg = host_msgs[-2]
                if (pre_ts < pre_msg['hdr']['msg_ts']) and (pre_msg['hdr']['msg_ts'] < msg_ts):
                    pre_ts    = pre_msg['hdr']['msg_ts']
                    pre_procs = dict([(proc['pid'], proc) for proc in pre_msg['processes']])
            delta       = 0.0 if -1.0 == pre_ts else msg_ts - pre_ts
            procsByUser = self.getProcsByUser(hostname, msg_ts, msg_procs, pre_ts, pre_procs,
                                              node2uid2cpuCnt.get(hostname, {}))
            nodeUserProcs[hostname] = [slurmNode.get('state', '?STATE?'), delta, msg_ts] + procsByUser
            # update savNode2TsProcs
            self.savNode2TsProcs[hostname] = (msg_ts, msg_procs, nodeUserProcs[hostname])

        # save information to files
        if self.write_file_flag:
            self.hostData_dir.writeData(hostname, ts, nodeUserProcs[hostname])
        else:
            logger.debug("simulate write to file")

    self.discardMessage(msgs)
    self.sendUpdate(ts, slurmJobs, nodeUserProcs, slurmNodes)
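# dealData unpacks self.savNode2TsProcs[hostname] for every host, including
# hosts it has never seen, and treats pre_ts == -1 as "no earlier sample". A
# defaultdict with that sentinel satisfies both behaviors; the constructor that
# actually initializes the cache is not shown here, so the sketch below is an
# assumption consistent with the checks in dealData.
from collections import defaultdict

def _make_node_cache_sketch():
    """hostname -> (last msg_ts, last procs-by-pid, last nodeUserProcs record)."""
    return defaultdict(lambda: (-1, {}, None))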
def display_user_GPU(user_name):
    ts  = int(time.time())
    uid = MyTool.getUid(user_name)
    if not uid:
        print("{} User {} does not exist.".format(MyTool.getTsString(ts), user_name))
        return
    user_job_lst = PyslurmQuery.getUserCurrJobs(uid)
    if not user_job_lst:
        print("{} User {} has no running jobs.".format(MyTool.getTsString(ts), user_name))
        return

    node_dict = PyslurmQuery.getAllNodes()
    # job id -> {node: [gpu indices]}
    job_gpu_d = dict([(job['job_id'], PyslurmQuery.getJobAllocGPU(job, node_dict)) for job in user_job_lst])
    # node entries across jobs (a node shared by two jobs is counted twice)
    u_node    = [node_name for g_alloc_d in job_gpu_d.values() for node_name in g_alloc_d]
    u_gpu_cnt = sum([len(g_lst) for g_alloc_d in job_gpu_d.values() for g_lst in g_alloc_d.values()])
    g_union   = reduce(lambda rlt, curr: rlt.union(set(curr)),
                       [g_lst for g_alloc_d in job_gpu_d.values() for g_lst in g_alloc_d.values()], set())
    print("{} User {} has {} running jobs,\talloc {} GPUs on {} GPU nodes.".format(
          MyTool.getTsString(ts), user_name, len(user_job_lst), u_gpu_cnt, len(u_node)))

    # get gpu utilization data since the earliest start among the GPU jobs
    if u_node:   # GPU nodes allocated
        gpu_data = BrightRestClient().getGPU(u_node,
                                             min([job['start_time'] for job in user_job_lst if job_gpu_d[job['job_id']]]),
                                             list(g_union), msec=False)
    else:
        gpu_data = {}

    print("\t{:10}{:20}{:>16}{:>20}{:>25}".format("Jid", "Job run time", "Node.GPU", "Job avg util", "Avg util (5,10,30min)"))
    for job in user_job_lst:
        jid        = job['job_id']
        j_run_time = str(datetime.timedelta(seconds=ts - job['start_time']))
        j_first_ln = True
        if not job_gpu_d[jid]:   # job not using GPU
            print("\t{:<10}{:20}{:>16}".format(jid, j_run_time, 'No GPU'))
            continue
        for node, g_lst in job_gpu_d[jid].items():
            for g in g_lst:
                g_name = '{}.gpu{}'.format(node, g)
                g_data = gpu_data[g_name]
                g_avg  = MyTool.getTimeSeqAvg(g_data, job['start_time'], ts)
                g_avg1 = MyTool.getTimeSeqAvg(g_data, ts -  5 * 60, ts)
                g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
                g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
                if j_first_ln:
                    # first GPU line of a job repeats the job id and run time
                    print("\t{:<10}{:20}{:>16}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".format(
                          jid, j_run_time, g_name, g_avg * 100, g_avg1 * 100, g_avg2 * 100, g_avg3 * 100))
                    j_first_ln = False
                else:
                    print("\t{:<10}{:20}{:>16}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".format(
                          '', '', g_name, g_avg * 100, g_avg1 * 100, g_avg2 * 100, g_avg3 * 100))
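# The three display_* helpers above fit naturally behind a small command-line
# front end. The entry point below is a hypothetical usage sketch; the option
# names are assumptions, not part of the module.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Show GPU utilization reported by Slurm/Bright.')
    group  = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--node', help='display GPU usage of a node')
    group.add_argument('--job',  type=int, help='display GPU usage of a running job')
    group.add_argument('--user', help="display GPU usage of a user's running jobs")
    args = parser.parse_args()
    if args.node:
        display_node_GPU(args.node)
    elif args.job:
        display_job_GPU(args.job)
    else:
        display_user_GPU(args.user)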