def get_lnet_data_file(fn, k1, k2, samples, histories):
    """Interpolate one job's summed data onto ``samples`` and record it.

    The job file ``fn`` is loaded through ``tspl.TSPLSum`` for the key lists
    ``k1``/``k2``.  The value returned by ``tspl_utils.global_interp_data``
    is stored in ``histories`` under the job id.  A file that raises
    ``tspl.TSPLException`` is skipped silently and ``histories`` is left
    untouched.  Nothing is returned.
    """
    try:
        series = tspl.TSPLSum(fn, k1, k2)
    except tspl.TSPLException:
        return

    interpolated = tspl_utils.global_interp_data(series, samples)
    histories[series.j.id] = interpolated
def setup(self, job_data):
    """Load the job's time series and decide whether it is worth testing.

    Resets the selection criteria on ``self`` (aggregate mode, minimum run
    time, minimum host count, accepted waynesses, ignored queues) and the
    ``metric`` placeholder, then builds ``self.ts`` from ``job_data`` using
    the keys in ``self.k1``/``self.k2``.

    Returns:
        True when the data loaded, ``tspl_utils.checkjob`` accepted the job
        and it ran on at least ``self.min_hosts`` hosts; False otherwise.
    """
    self.aggregate = True
    self.min_time = 3600
    self.min_hosts = 1
    self.waynesses = list(range(1, 33))
    self.ignore_qs = []
    self.metric = float("nan")

    loader = tspl.TSPLSum if self.aggregate else tspl.TSPLBase
    try:
        self.ts = loader("", self.k1, self.k2, job_data=job_data)
    except tspl.TSPLException:
        return False
    except EOFError:
        # The previous message concatenated an undefined ``job_path`` name,
        # which turned this handler into a NameError.  No path is available
        # here (the data arrives as ``job_data``), so report generically.
        print('End of file found reading job data')
        return False

    if not tspl_utils.checkjob(self.ts, self.min_time, self.waynesses,
                               skip_queues=self.ignore_qs):
        return False
    if self.ts.numhosts < self.min_hosts:
        return False
    return True
def __init__(self, jobid, k1, k2, aggregate=True, stats=None):
    """Build the time-series (``ts``) and lariat (``ld``) objects for a job.

    ``k1``/``k2`` are the key lists handed to ``tspl``; ``aggregate`` picks
    ``tspl.TSPLSum`` (True) or ``tspl.TSPLBase`` (False); ``stats`` is
    forwarded as ``job_data``.  On ``tspl.TSPLException`` the constructor
    returns early and ``self.ts`` is left unset; on ``EOFError`` it also
    prints a message first.
    """
    self.k1 = k1
    self.k2 = k2
    self.jobid = jobid
    self.aggregate = aggregate

    try:
        loader = tspl.TSPLSum if self.aggregate else tspl.TSPLBase
        self.ts = loader(jobid, self.k1, self.k2, job_data=stats)

        # NOTE(review): ``self.ld`` is read before this method assigns it,
        # so it presumably exists as a class-level attribute -- confirm.
        if not self.ld:
            self.ld = lariat_utils.LariatData()

        self.ld.get_job(self.ts.j.id,
                        end_epoch=self.ts.j.end_time,
                        daysback=3,
                        directory=lariat_path)
    except tspl.TSPLException:
        pass
    except EOFError:
        print('End of file found reading: ' + jobid)
def type_plot(request, pk, type_name):
    """Render one stacked line plot per event of ``type_name`` for job ``pk``.

    The job data comes from ``get_data(pk)`` and the event names from
    ``build_schema`` (text before the first comma of each entry).  Every
    event gets its own axis sharing the x axis; the finished figure is
    handed to ``figure_to_response`` and that result is returned.
    ``request`` is accepted but not used here.

    Values are plotted as rates unless ``type_name`` is ``'mem'``.
    """
    data = get_data(pk)
    schema = [entry.split(',')[0] for entry in build_schema(data, type_name)]

    k1 = {'intel': [type_name] * len(schema)}
    k2 = {'intel': schema}
    ts = tspl.TSPLSum(None, k1, k2, job_stats=data)

    nr_events = len(schema)
    # squeeze=False always yields a 2-D array of axes.  Without it a schema
    # with a single event produced a bare Axes object and the indexing
    # below failed.
    fig, axarr = plt.subplots(nr_events, sharex=True, squeeze=False,
                              figsize=(8, nr_events * 2), dpi=80)
    axes = axarr[:, 0]

    # Loop-invariant: only 'mem' is plotted as raw values.
    do_rate = type_name != 'mem'
    for i, (axis, event) in enumerate(zip(axes, schema)):
        mp.plot_lines(axis, ts, [i], 3600., do_rate=do_rate)
        axis.set_ylabel(event, size='small')

    axes[-1].set_xlabel("Time (hr)")
    fig.subplots_adjust(hspace=0.0)
    fig.tight_layout()
    return figure_to_response(fig)
def main():
    """Plot MemUsed minus AnonPages over time for each accepted job.

    Takes one optional positional argument (file, directory or quoted glob,
    default ``jobs``).  Jobs that fail to load or are rejected by
    ``tspl_utils.checkjob(ts, 3600, 16)`` are skipped.  For the rest, one
    curve per host is drawn and saved as
    ``graph_<jobid>_<k1>_<k2>.png``.
    """
    parser = argparse.ArgumentParser(
        description='Plot MemUsed-AnonPages for jobs')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    args = parser.parse_args()

    for path in tspl_utils.getfilelist(args.filearg):
        try:
            ts = tspl.TSPLSum(path, ['mem', 'mem'], ['MemUsed', 'AnonPages'])
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600, 16):
            continue
        print(ts.j.id)

        fig = plt.figure()
        ax = fig.gca()
        ax.hold = True
        hours = ts.t / 3600.
        for host in ts.j.hosts.keys():
            used = ts.data[0][host][0]
            delta = used - ts.data[1][host][0]
            # Shift by the first MemUsed sample of this host.
            delta -= used[0]
            ax.plot(hours, delta)

        unit = ts.j.get_schema(ts.k1[0])[ts.k2[0]].unit
        ax.set_ylabel('MemUsed - AnonPages ' + unit)
        ax.set_xlabel('Time (hr)')
        plt.suptitle(ts.title)

        fname = '_'.join(['graph', ts.j.id, ts.k1[0], ts.k2[0]]) + '.png'
        fig.savefig(fname)
        plt.close()
def get_samples(fn, times):
    """Append the sorted sample times of job file ``fn`` to ``times``.

    The file is opened with the ``lnet``/``tx_bytes`` key pair.  Files that
    raise ``tspl.TSPLException`` contribute nothing.
    """
    try:
        series = tspl.TSPLSum(fn, ['lnet'], ['tx_bytes'])
    except tspl.TSPLException:
        return

    ordered = sorted(series.j.times)
    times.append(ordered)
def main():
    """Plot one key pair for every job whose reduced rate passes a threshold.

    For each job accepted by ``tspl_utils.checkjob(ts, 3600, 16)`` the
    per-series rate is reduced with ``max`` (``--max``) or
    ``scipy.stats.tmean`` (default), and those reductions are reduced once
    more with the same function.  When no ``-t`` threshold is given, or the
    result exceeds it, the job is drawn with ``heatmap`` (``-m``) or
    ``lineplot``; otherwise an "under threshold" line is printed.
    """
    parser = argparse.ArgumentParser(
        description='Plot a key pair for some jobs')
    parser.add_argument('-t', help='Threshold', metavar='thresh')
    parser.add_argument('key1', help='First key', nargs='?',
                        default='amd64_core')
    parser.add_argument('key2', help='Second key', nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    parser.add_argument('-f', help='Set full mode', action='store_true')
    parser.add_argument('-m', help='Set heatmap mode', action='store_true')
    parser.add_argument('--max', help='Use max instead of mean',
                        action='store_true')
    n = parser.parse_args()

    reducer = max if n.max else scipy.stats.tmean

    for path in tspl_utils.getfilelist(n.filearg):
        try:
            if n.f:
                full = '_full'
                ts = tspl.TSPLBase(path, [n.key1], [n.key2])
            else:
                full = ''
                ts = tspl.TSPLSum(path, [n.key1], [n.key2])
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600, 16):
            continue

        # One reduced value per series yielded by ts.
        reduction = [
            reducer(numpy.divide(numpy.diff(v), numpy.diff(ts.t)))
            for v in ts
        ]
        m = reducer(reduction)

        if n.t and not (m > float(n.t)):
            print(ts.j.id + ': under threshold, ' + str(m) + ' < ' + n.t)
            continue

        print(ts.j.id + ': ' + str(m))
        draw = heatmap if n.m else lineplot
        draw(ts, n, m, full)
def compute_imbalance(file, k1, k2, threshold, plot_flag, full_flag, ratios):
    """Compute the std-dev:mean imbalance statistic for one job file.

    The job is loaded with ``tspl.TSPLBase`` when ``full_flag`` is set and
    ``tspl.TSPLSum`` otherwise.  Jobs that fail to load, are rejected by
    ``tspl_utils.checkjob`` (3600, 16, gpu/vis queues ignored) or ran on
    fewer than two hosts are skipped.  The mean over time of std/mean of
    the per-series rates is stored as ``ratios[jobid] = [value, owner]``
    and printed.  When ``plot_flag`` is set and the absolute value exceeds
    ``threshold`` the job is plotted via ``plot_ratios``.
    """
    try:
        if full_flag:
            full = '_full'
            ts = tspl.TSPLBase(file, k1, k2)
        else:
            full = ''
            ts = tspl.TSPLSum(file, k1, k2)
    except tspl.TSPLException:
        return
    except EOFError:
        print('End of file found reading: ' + file)
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600, 16, ignore_qs):
        return
    if ts.numhosts < 2:
        print(ts.j.id + ': 1 host')
        return

    # Interval midpoints and widths with the first interval dropped (the
    # old index range 1..len-1 removed only the first entry).
    tmid = ((ts.t[:-1] + ts.t[1:]) / 2.0)[1:]
    dt = numpy.diff(ts.t)[1:]
    nsamples = len(tmid)

    maxval = numpy.zeros(nsamples)
    minval = numpy.ones(nsamples) * 1e100
    rate = []
    for v in ts:
        current = numpy.divide(numpy.diff(v)[1:], dt)
        rate.append(current)
        maxval = numpy.maximum(maxval, current)
        minval = numpy.minimum(minval, current)

    # Per-interval mean and standard deviation across all series.
    mean = []
    std = []
    for j in range(nsamples):
        column = [series[j] for series in rate]
        mean.append(scipy.stats.tmean(column))
        std.append(scipy.stats.tstd(column))

    imbl = maxval - minval
    ratio = numpy.divide(std, mean)
    ratio2 = numpy.divide(imbl, maxval)

    # The mean of the ratios is the threshold statistic.
    var = scipy.stats.tmean(ratio)

    ratios[ts.j.id] = [var, ts.owner]
    print(ts.j.id + ': ' + str(var))

    if plot_flag and abs(var) > threshold:
        fig, ax = plt.subplots(2, 1, figsize=(8, 8), dpi=80)
        plot_ratios(ts, tmid, ratio, ratio2, rate, var, fig, ax, full)
def main():
    """Nightly driver: find imbalanced jobs and plot them.

    Runs ``imbalance.compute_imbalance`` over all files in a worker pool
    (``-p`` processes), collects the files whose job id matches an entry
    with a ratio above ``threshold``, maps ``do_mp`` over
    ``(file, ratio)`` pairs, calls ``imbalance.find_top_users`` and finally
    draws a correlation plot for each flagged file that loads.
    """
    parser = argparse.ArgumentParser(
        description='Deal with a directory of pickle'
        ' files nightly')
    parser.add_argument('-p', help='Set number of processes',
                        nargs=1, type=int, default=[1])
    parser.add_argument('threshold', help='Treshold ratio for std dev:mean',
                        nargs='?', default=0.25)
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)

    pool = multiprocessing.Pool(processes=n.p[0])
    manager = multiprocessing.Manager()
    ratios = manager.dict()

    threshold = float(n.threshold)
    partial_imbal = functools.partial(imbalance.compute_imbalance,
                                      k1=['amd64_core'],
                                      k2=['SSE_FLOPS'],
                                      threshold=threshold,
                                      plot_flag=False,
                                      full_flag=False,
                                      ratios=ratios)
    pool.map(partial_imbal, filelist)

    # (file, ratio) for every file whose name matches an over-threshold id.
    flagged = []
    for jobid in ratios.keys():
        value = ratios[jobid][0]
        if value > threshold:
            for path in filelist:
                if re.search(jobid, path):
                    flagged.append((path, value))

    pool.map(do_mp, flagged)  # Pool.starmap should exist....

    bad_users = imbalance.find_top_users(ratios)

    for path, _ in flagged:
        try:
            ts = tspl.TSPLSum(path, ['amd64_core', 'cpu'],
                              ['SSE_FLOPS', 'user'])
        except tspl.TSPLException:
            continue
        uncorrelated.plot_correlation(ts, uncorrelated.pearson(ts), '')
def main():
    """Save a phase plot (rate[i+1] against rate[i]) for each accepted job.

    ``-f`` switches from ``tspl.TSPLSum`` to ``tspl.TSPLBase`` and appends
    ``_full`` to the output name.  Jobs rejected by
    ``tspl_utils.checkjob(ts, 3600, 16)`` or with fewer than two hosts are
    skipped.  Both axes share limits spanning all plotted rates and zero.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', help='Set full mode', action='store_true')
    parser.add_argument('key1', help='First key', nargs='?',
                        default='amd64_core')
    parser.add_argument('key2', help='Second key', nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    n = parser.parse_args()

    for path in tspl_utils.getfilelist(n.filearg):
        try:
            if n.f:
                full = '_full'
                ts = tspl.TSPLBase(path, [n.key1], [n.key2])
            else:
                full = ''
                ts = tspl.TSPLSum(path, [n.key1], [n.key2])
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600, 16):  # 1 hour, 16way only
            continue
        if ts.numhosts < 2:  # At least 2 hosts
            print(ts.j.id + ': 1 host')
            continue

        print(ts.j.id)
        fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=80)

        lo, hi = 0., 0.
        for v in ts:
            rate = numpy.divide(numpy.diff(v), numpy.diff(ts.t))
            lo = min(lo, min(rate))
            hi = max(hi, max(rate))
            ax.hold = True
            ax.plot(rate[1:], rate[:-1], '.')

        ax.set_ylim(bottom=lo, top=hi)
        ax.set_xlim(left=lo, right=hi)

        fname = '_'.join(
            ['graph', ts.j.id, ts.k1[0], ts.k2[0], 'phase' + full])
        fig.savefig(fname)
        plt.close()
def do_check(f, jobs):
    """Record the lariat ``exc`` value of job file ``f`` in ``jobs``.

    Files that raise ``tspl.TSPLException`` or are rejected by
    ``tspl_utils.checkjob`` (3600, waynesses 1..32) are ignored.  Otherwise
    ``jobs[jobid]`` is set to the ``exc`` attribute of the job's
    ``LariatData``.
    """
    try:
        series = tspl.TSPLSum(f, ['amd64_core'], ['SSE_FLOPS'])
    except tspl.TSPLException:
        return

    accepted = tspl_utils.checkjob(series, 3600, range(1, 33))  # 1 hour
    if not accepted:
        return

    job = series.j
    lariat = lariat_utils.LariatData(job.id, job.end_time,
                                     analyze_conf.lariat_path)
    jobs[job.id] = lariat.exc
def main():
    """Plot jobs whose two key pairs are poorly correlated.

    For each job accepted by ``tspl_utils.checkjob(ts, 3600, 16)`` the
    value of ``pearson(ts)`` is printed; when its absolute value is below
    the threshold the job is passed to ``plot_correlation``.  ``-f``
    selects ``tspl.TSPLBase`` and the ``_full`` suffix.
    """
    parser = argparse.ArgumentParser(
        description='Look for lack of correlation'
        ' between two key pairs/')
    parser.add_argument('threshold', help='Treshold Pearson R',
                        nargs='?', default=0.8)
    parser.add_argument('keya1', help='Key A1', nargs='?',
                        default='amd64_core')
    parser.add_argument('keya2', help='Key A2', nargs='?', default='DCSF')
    parser.add_argument('keyb1', help='Key B1', nargs='?',
                        default='amd64_core')
    parser.add_argument('keyb2', help='Key B2', nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    parser.add_argument('-f', help='Set full mode', action='store_true')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)
    threshold = n.threshold
    k1 = [n.keya1, n.keyb1]
    k2 = [n.keya2, n.keyb2]

    for path in filelist:
        try:
            if n.f:
                full = '_full'
                ts = tspl.TSPLBase(path, k1, k2)
            else:
                full = ''
                ts = tspl.TSPLSum(path, k1, k2)
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600, 16):
            continue

        r = pearson(ts)
        print(ts.j.id + ': ' + str(r))
        weakly_correlated = abs(r) < float(threshold)
        if weakly_correlated:
            plot_correlation(ts, r, full)
def do_un(arg):
    """Draw the correlation plot for one ``(file, output_dir)`` pair.

    ``arg`` is unpacked into the job file and the output directory.  The
    key tables name a floating-point/load counter and CPU user time for
    the ``amd64`` and ``intel_snb`` entries.  Files that raise
    ``tspl.TSPLException`` are skipped.
    """
    path, output_dir = arg

    k1 = {
        'amd64': ['amd64_core', 'cpu'],
        'intel_snb': ['intel_snb', 'cpu'],
    }
    k2 = {
        'amd64': ['SSE_FLOPS', 'user'],
        'intel_snb': ['LOAD_L1D_ALL', 'user'],
    }

    try:
        series = tspl.TSPLSum(path, k1, k2)
    except tspl.TSPLException:
        return

    coefficient = uncorrelated.pearson(series)
    uncorrelated.plot_correlation(series, coefficient, '', output_dir)
def main():
    """Print CSV rows of per-host rates for one or more key pairs.

    ``-k1``/``-k2`` take the key lists and ``-f`` the file, directory or
    glob.  Jobs rejected by ``tspl_utils.checkjob(ts, 0, 16)`` or with
    fewer than two hosts are skipped.  Each row is
    ``jobid,host,midpoint_time,rate_0,...,rate_n``.
    """
    parser = argparse.ArgumentParser(
        description='Dump CSV for a key pair for some jobs')
    parser.add_argument('-k1', help='Set first key', nargs='+', type=str,
                        default=['amd64_sock'])
    parser.add_argument('-k2', help='Set second key', nargs='+', type=str,
                        default=['DRAM'])
    parser.add_argument('-f',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs=1, type=str, default=['jobs'])
    n = parser.parse_args()

    for path in tspl_utils.getfilelist(n.f[0]):
        try:
            ts = tspl.TSPLSum(path, n.k1, n.k2)
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 0, 16):
            continue
        if ts.numhosts < 2:
            print(ts.j.id + ': 1 host')
            continue

        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

        for host in ts.j.hosts.keys():
            # One rate array per key for this host.
            rates = [
                numpy.divide(numpy.diff(per_key[host][0]), numpy.diff(ts.t))
                for per_key in ts.data
            ]
            for i, midpoint in enumerate(tmid):
                fields = [ts.j.id, host, str(midpoint)]
                fields.extend(str(r[i]) for r in rates)
                print(','.join(fields))
def fit_step(fn, k1, k2, genplot=False, res=None):
    """Find, per fitted series, the split index with the smallest fit ratio.

    Args:
        fn: job file to load with ``tspl.TSPLSum``.
        k1, k2: key lists passed to ``tspl.TSPLSum``.
        genplot: when true, the ratio curves are drawn on a semilog plot
            and saved as ``foo.pdf``.
        res: mapping that receives ``res[fn] = [(index, ratio), ...]``.
            Pass a dict (or a manager dict) to collect results; when
            omitted the result is discarded.

    Returns:
        None.  Jobs that fail to load, are rejected by
        ``tspl_utils.checkjob`` or contain hosts flagged by
        ``tspl_utils.lost_data`` are skipped.
    """
    # A fresh dict per call: the previous ``res={}`` default was a single
    # dict shared by every call that did not pass ``res``.
    if res is None:
        res = {}

    try:
        ts = tspl.TSPLSum(fn, k1, k2)
    except tspl.TSPLException:
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600, range(1, 33), ignore_qs):
        return
    elif ts.numhosts < 2:  # At least 2 hosts
        # NOTE(review): sibling functions return after this message; here
        # processing continues for single-host jobs -- confirm intended.
        print(ts.j.id + ': 1 host')

    bad_hosts = tspl_utils.lost_data(ts)
    if len(bad_hosts) > 0:
        print(' '.join([str(ts.j.id), ': Detected hosts with bad data: ',
                        str(bad_hosts)]))
        return

    # Candidate split points 2 .. ts.size-3.
    fits = [compute_fit_params(ts, i) for i in range(2, ts.size - 2)]
    fit_ratios = [[b / a for (a, b) in fit] for fit in fits]

    # One row per fitted series, one column per candidate split point.
    brr = numpy.transpose(numpy.array(fit_ratios))
    (m, n) = numpy.shape(brr)

    if genplot:
        fig, ax = plt.subplots(1, 1, dpi=80)
        ax.hold = True
        for i in range(m):
            ax.semilogy(brr[i, :])
        fig.savefig('foo.pdf')
        plt.close()

    r = []
    for i in range(m):
        jnd = numpy.argmin(brr[i, :])
        r.append((jnd, brr[i, jnd]))

    res[fn] = r
def main():
    """Print CSV rows ``jobid,host,midpoint_time,rate`` for one key pair.

    Positional arguments (all optional): ``key1``, ``key2`` and ``filearg``
    (file, directory or quoted glob).  The file argument is echoed first.
    Jobs rejected by ``tspl_utils.checkjob(ts, 3600, 16)`` or with fewer
    than two hosts are skipped.
    """
    parser = argparse.ArgumentParser(
        description='Dump CSV for a key pair for some jobs')
    parser.add_argument('key1', help='First key', nargs='?',
                        default='amd64_core')
    parser.add_argument('key2', help='Second key', nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)

    # Echo the parsed file argument.  The old code read sys.argv[3]
    # directly, which raised IndexError whenever the optional positionals
    # were left at their defaults.
    print(n.filearg)

    for path in filelist:
        try:
            ts = tspl.TSPLSum(path, [n.key1], [n.key2])
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600, 16):
            continue
        elif ts.numhosts < 2:
            print(ts.j.id + ': 1 host')
            continue

        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0
        dt = numpy.diff(ts.t)

        for host in ts.j.hosts.keys():
            rate = numpy.divide(numpy.diff(ts.data[0][host][0]), dt)
            for midpoint, value in zip(tmid, rate):
                print(','.join([ts.j.id, host, str(midpoint), str(value)]))
def isidle(file, thresh):
    """Report whether any host of a job looks idle relative to its peers.

    For every loaded key and host the rate is compared with the per-interval
    maximum over hosts; the mean relative shortfall ``(max - rate) / max``
    is computed (NaN counted as 0).

    Args:
        file: job file to load with ``tspl.TSPLSum``.
        thresh: shortfall above which the job counts as having an idle host.

    Returns:
        True or False for analysed jobs.  None when the job fails to load,
        is rejected by ``tspl_utils.checkjob`` or ran on fewer than two
        hosts.
    """
    k1 = {
        'amd64': ['amd64_core', 'amd64_sock', 'cpu'],
        'intel_snb': ['intel_snb', 'intel_snb', 'cpu'],
    }
    k2 = {
        'amd64': ['SSE_FLOPS', 'DRAM', 'user'],
        'intel_snb': ['SIMD_D_256', 'LOAD_L1D_ALL', 'user'],
    }

    try:
        ts = tspl.TSPLSum(file, k1, k2)
    except tspl.TSPLException:
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600, range(1, 33), ignore_qs):
        return
    elif ts.numhosts < 2:  # At least 2 hosts
        print(ts.j.id + ': 1 host')
        return

    hosts = list(ts.j.hosts.keys())
    dt = numpy.diff(ts.t)
    nintervals = len(ts.t) - 1

    sums = []
    # Iterate over the loaded keys.  The old loops used len(k1), but k1 is
    # a dict keyed by architecture (two entries), so the third metric of
    # each architecture was never examined.
    for per_key in ts.data:
        rates = [numpy.divide(numpy.diff(per_key[h]), dt) for h in hosts]

        maxrate = numpy.zeros(nintervals)
        for rate in rates:
            maxrate = numpy.maximum(rate, maxrate)

        for rate in rates:
            shortfall = numpy.divide(maxrate - rate, maxrate)
            sums.append(numpy.sum(shortfall) / nintervals)

    # 0/0 (no activity on any host) yields NaN; count it as no shortfall.
    sums = [0. if math.isnan(x) else x for x in sums]

    return bool(max(sums) > thresh)
def has_highbw(file, thresh):
    """Report whether a job's mean memory-controller rate is a high share of peak.

    The summed CAS_READS + CAS_WRITES counters are multiplied by 64,
    differentiated in time, summed over hosts, averaged with
    ``scipy.stats.tmean`` and divided by the host count.  The ratio of
    that value to ``peak`` (76e9) is printed and compared with ``thresh``.

    Returns:
        True or False for analysed jobs.  None when the job fails to load,
        is rejected by ``tspl_utils.checkjob`` or ran on fewer than two
        hosts.

    Raises:
        Any other exception is reported (type, file name, line number) and
        re-raised.
    """
    import os
    import sys

    try:
        k1 = ['intel_snb_imc', 'intel_snb_imc']
        k2 = ['CAS_READS', 'CAS_WRITES']
        peak = 76. * 1.e9

        try:
            ts = tspl.TSPLSum(file, k1, k2)
        except tspl.TSPLException:
            return

        ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
        if not tspl_utils.checkjob(ts, 3600, range(1, 33), ignore_qs):
            return
        elif ts.numhosts < 2:  # At least 2 hosts
            print(ts.j.id + ': 1 host')
            return

        dt = numpy.diff(ts.t)
        gdramrate = numpy.zeros(len(ts.t) - 1)
        for h in ts.j.hosts.keys():
            gdramrate += numpy.divide(
                numpy.diff(64. * ts.assemble([0, 1], h, 0)), dt)

        mdr = scipy.stats.tmean(gdramrate) / ts.numhosts
        fraction = mdr / peak
        print(fraction)

        return bool(fraction > thresh)
    except Exception:
        exc_type, _, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print((exc_type, fname, exc_tb.tb_lineno))
        # Bare raise keeps the original traceback; ``raise e`` replaced it
        # with one starting here under Python 2.
        raise
def mem_usage(file):
    """Return the peak per-core MemUsed of a job as a one-element list.

    The largest ``MemUsed`` value over all hosts is divided by 1024**3 and
    by the job's wayness.  The value is returned as ``[mem_per_core]`` only
    when wayness * lariat threads is between 1 and 16; otherwise, and when
    the file fails to load, an empty list is returned.  Progress and
    diagnostics are printed.
    """
    try:
        ts = tspl.TSPLSum(file, ['mem'], ['MemUsed'])
    except tspl.TSPLException as err:
        print(err)
        return []

    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 analyze_conf.lariat_path)

    mem_max = 0.
    for host in ts.j.hosts.keys():
        host_peak = numpy.max(ts.data[0][host])
        mem_max = max(host_peak, mem_max)

    gib = 1024. * 1024. * 1024.
    mem_per_core = mem_max / (gib * float(ts.wayness))
    print(' '.join(str(item) for item in
                   (ts.j.id, ': ', mem_per_core, ts.wayness, ld.threads)))

    threads_per_node = int(ts.wayness) * int(ld.threads)
    if threads_per_node > 16:
        print(' '.join([str(ts.j.id), 'used more than one thread per core!']))

    if 0 < threads_per_node <= 16:
        return [mem_per_core]
    return []
def main():
    """Plot Lustre (llite) metadata operation rates for high-rate jobs.

    ``-t`` sets the threshold (default 100000.) and the positional argument
    names the file, directory or glob.  For each job accepted by
    ``tspl_utils.checkjob(ts, 3600., 1..32)`` the per-host mean of the
    summed operation rate is computed.  When its maximum exceeds the
    threshold, every operation is drawn per host and the figure is saved
    as ``metadata_<jobid>_<owner>``.
    """
    parser = argparse.ArgumentParser(
        description='Look for high meta data rate'
        ' to Lustre')
    parser.add_argument('-t', metavar='thresh',
                        help='Treshold metadata rate',
                        nargs=1, default=[100000.])
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    n = parser.parse_args()
    thresh = float(n.t[0])
    print(thresh)

    filelist = tspl_utils.getfilelist(n.filearg)

    # An earlier, commented-out revision of this list also contained
    # 'seek', 'getxattr' and 'inode_permission'.
    # NOTE(review): ' listxattr' carries a leading space -- confirm that it
    # matches the llite schema key.
    k2 = [
        'open', 'close', 'mmap', 'fsync', 'setattr', 'truncate', 'flock',
        'getattr', 'statfs', 'alloc_inode', 'setxattr', ' listxattr',
        'removexattr', 'readdir', 'create', 'lookup', 'link', 'unlink',
        'symlink', 'mkdir', 'rmdir', 'mknod', 'rename',
    ]
    k1 = ['llite'] * len(k2)

    markers = ('o', 'x', '+', '^', 's', '8', 'p', 'h', '*', 'D', '<', '>',
               'v', 'd', '.')
    colors = ('b', 'g', 'r', 'c', 'm', 'k', 'y')

    for path in filelist:
        try:
            ts = tspl.TSPLSum(path, k1, k2)
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600., range(1, 33)):
            continue

        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0
        dt = numpy.diff(ts.t)

        ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time, 'lariatData')

        meta_rate = numpy.zeros_like(tmid)
        for host in ts.j.hosts.keys():
            meta_rate += numpy.diff(
                ts.assemble(range(0, len(k1)), host, 0)) / dt
        meta_rate /= float(ts.numhosts)

        if not numpy.max(meta_rate) > thresh:
            continue

        title = ts.title
        if ld.exc != 'unknown':
            title += ', E: ' + ld.exc.split('/')[-1]

        fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)
        plt.subplots_adjust(hspace=0.35)
        plt.suptitle(title)

        # Label, colour and marker follow the key index.  The old running
        # counter advanced per host as well, so multi-host jobs were
        # mislabelled and eventually indexed past the end of k2.
        for idx, per_host in enumerate(ts.data):
            c = colors[idx % len(colors)]
            m = markers[idx % len(markers)]
            for host in per_host:
                for vals in per_host[host]:
                    rate = numpy.diff(vals) / dt
                    ax.plot(tmid / 3600., rate, marker=m, markeredgecolor=c,
                            linestyle='-', color=c, markerfacecolor='None',
                            label=k2[idx])
                    ax.hold = True

        ax.set_ylabel('Meta Data Rate (op/s)')
        tspl_utils.adjust_yaxis_range(ax, 0.1)

        # Keep one legend entry per label.
        handles, labels = ax.get_legend_handles_labels()
        new_handles = {}
        for h, l in zip(handles, labels):
            new_handles[l] = h

        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
        ax.legend(new_handles.values(), new_handles.keys(),
                  prop={'size': 8}, bbox_to_anchor=(1.05, 1),
                  borderaxespad=0., loc=2)

        fname = '_'.join(['metadata', ts.j.id, ts.owner])
        fig.savefig(fname)
        plt.close()
def getqueue(file, queue):
    """Summarise the resource rates of a job that ran in ``queue``.

    Returns:
        A tuple ``(end_time, <min, max, mean> x 7, jobid)`` where the seven
        series are, in order: DRAM rate, L1 rate, lnet rate, IB rate, CPU
        user rate, FLOPS rate and memory usage, each averaged over hosts
        and scaled as in the code below.  None when the job fails to load,
        ran in a different queue or is rejected by ``tspl_utils.checkjob``.

    Raises:
        Any other exception is reported (type, file name, line number) and
        re-raised.
    """
    import os
    import sys

    try:
        k1 = [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'lnet', 'lnet',
            'ib_sw', 'ib_sw', 'cpu', 'intel_snb', 'intel_snb', 'mem'
        ]
        k2 = [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'rx_bytes',
            'tx_bytes', 'rx_bytes', 'tx_bytes', 'user', 'SSE_D_ALL',
            'SIMD_D_256', 'MemUsed'
        ]

        try:
            ts = tspl.TSPLSum(file, k1, k2)
        except tspl.TSPLException:
            return

        if ts.queue != queue:
            return
        if not tspl_utils.checkjob(ts, 1., range(1, 33)):
            return

        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0
        dt = numpy.diff(ts.t)

        dram_rate = numpy.zeros_like(tmid)
        l1_rate = numpy.zeros_like(tmid)
        lnet_rate = numpy.zeros_like(tmid)
        ib_rate = numpy.zeros_like(tmid)
        user_rate = numpy.zeros_like(tmid)
        flops_rate = numpy.zeros_like(tmid)
        mem_usage = numpy.zeros_like(tmid)

        for host in ts.j.hosts.keys():
            dram_rate += numpy.diff(ts.assemble([0, 1], host, 0)) / dt
            l1_rate += numpy.diff(ts.assemble([2], host, 0)) / dt
            lnet_rate += numpy.diff(ts.assemble([3, 4], host, 0)) / dt
            ib_rate += numpy.diff(ts.assemble([5, 6, -3, -4], host, 0)) / dt
            user_rate += numpy.diff(ts.assemble([7], host, 0)) / dt
            flops_rate += numpy.diff(ts.assemble([8, 9], host, 0)) / dt
            # Memory usage is a level, not a counter: average neighbouring
            # samples instead of differentiating.
            v = ts.assemble([10], host, 0)
            mem_usage += (v[:-1] + v[1:]) / 2.0

        nhosts = float(ts.numhosts)
        dram_rate /= nhosts * 1024. * 1024. * 1024. / 64.
        l1_rate /= nhosts * 1024. * 1024. / 64.
        lnet_rate /= nhosts * 1e6
        ib_rate /= nhosts * 1e6
        user_rate /= nhosts * 100. * ts.wayness
        flops_rate /= nhosts * 1e9
        mem_usage /= nhosts * (1024. * 1024. * 1024.)

        # (min, max, mean) for each series, in the documented order.
        stats = []
        for series in (dram_rate, l1_rate, lnet_rate, ib_rate, user_rate,
                       flops_rate, mem_usage):
            stats.extend((numpy.min(series), numpy.max(series),
                          numpy.mean(series)))

        return (ts.j.acct['end_time'],) + tuple(stats) + (ts.j.id,)
    except Exception:
        exc_type, _, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print((exc_type, fname, exc_tb.tb_lineno))
        # Bare raise keeps the original traceback; ``raise e`` replaced it
        # with one starting here under Python 2.
        raise
def master_plot(file, mode='lines', threshold=False, output_dir='.',
                prefix='graph', mintime=3600, wayness=16, header='Master',
                lariat_dict=None, wide=False, job_stats=None):
    """Build the six-panel "master" figure for one job.

    Args:
        file: job file passed to ``tspl.TSPLSum``.
        mode: ``'hist'`` selects ``plot_thist``, ``'percentile'`` selects
            ``plot_mmm``, anything else ``plot_lines``.
        threshold: when truthy, appended to the title as ``V: <value>``.
        output_dir: accepted but not used inside this function.
        prefix: first component of the returned file name.
        mintime, wayness: forwarded to ``tspl_utils.checkjob``.
        header: first title line; ``None`` suppresses the title.
        lariat_dict: ``None`` loads lariat data from
            ``analyze_conf.lariat_path``; ``"pass"`` builds a bare
            ``LariatData``; anything else is passed as ``olddata``.
        wide: two-column layout with summary text on the left.
        job_stats: forwarded to ``tspl.TSPLSum``.

    Returns:
        ``(fig, fname)``; the figure has already been closed in pyplot.
        None when the job fails to load, is rejected by ``checkjob`` or
        has a ``pmc_type`` other than ``intel_snb``/``intel``.
    """
    k1 = {
        'amd64': [
            'amd64_core', 'amd64_core', 'amd64_sock', 'lnet', 'lnet',
            'ib_sw', 'ib_sw', 'cpu'
        ],
        'intel': [
            'intel_pmc3', 'intel_pmc3', 'intel_pmc3', 'lnet', 'lnet',
            'ib_ext', 'ib_ext', 'cpu', 'mem', 'mem'
        ],
        'intel_snb': [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'lnet', 'lnet',
            'ib_sw', 'ib_sw', 'cpu', 'intel_snb', 'intel_snb', 'mem', 'mem'
        ],
    }
    k2 = {
        'amd64': [
            'SSE_FLOPS', 'DCSF', 'DRAM', 'rx_bytes', 'tx_bytes', 'rx_bytes',
            'tx_bytes', 'user'
        ],
        'intel': [
            'MEM_LOAD_RETIRED_L1D_HIT', 'FP_COMP_OPS_EXE_X87',
            'INSTRUCTIONS_RETIRED', 'rx_bytes', 'tx_bytes',
            'port_recv_data', 'port_xmit_data', 'user', 'MemUsed',
            'AnonPages'
        ],
        'intel_snb': [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'rx_bytes',
            'tx_bytes', 'rx_bytes', 'tx_bytes', 'user', 'SSE_D_ALL',
            'SIMD_D_256', 'MemUsed', 'AnonPages'
        ],
    }

    try:
        print(file)
        ts = tspl.TSPLSum(file, k1, k2, job_stats)
    except tspl.TSPLException:
        return

    ignore_qs = []  # 'gpu','gpudev','vis','visdev']
    if not tspl_utils.checkjob(ts, mintime, wayness, ignore_qs):
        return

    if lariat_dict is None:
        ld = lariat_utils.LariatData(ts.j.id, end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    elif lariat_dict == "pass":
        ld = lariat_utils.LariatData(ts.j.id)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    # Use the smaller of the two waynesses when lariat reports one.
    wayness = ts.wayness
    if ld.wayness != -1 and ld.wayness < ts.wayness:
        wayness = ld.wayness

    # Reject unsupported counter types before any figure exists.  The check
    # used to run after plt.subplots(), leaving an unclosed figure behind
    # for every such job.
    if ts.pmc_type not in ('intel_snb', 'intel'):
        # Fix this to support the old amd plots
        print(ts.pmc_type + ' not supported')
        return

    if wide:
        fig, ax = plt.subplots(6, 2, figsize=(15.5, 12), dpi=110)
        # Make 2-d array into 1-d, and reorder so that the left side is
        # blank
        ax = my_utils.flatten(ax)
        ax_even = ax[0:12:2]
        ax_odd = ax[1:12:2]
        ax = ax_odd + ax_even
        for a in ax_even:
            a.axis('off')
    else:
        fig, ax = plt.subplots(6, 1, figsize=(8, 12), dpi=110)

    if mode == 'hist':
        plot = plot_thist
    elif mode == 'percentile':
        plot = plot_mmm
    else:
        plot = plot_lines

    is_snb = ts.pmc_type == 'intel_snb'
    gib = 1024.0 * 1024.0 * 1024.0

    if is_snb:
        plot(ax[0], ts, [8, 9], 3600., 1e9,
             ylabel='Total AVX +\nSSE Ginst/s')
        plot(ax[1], ts, [0, 1], 3600., 1.0 / 64.0 * 1024. * 1024. * 1024.,
             ylabel='Total Mem BW GB/s')
        plot(ax[2], ts, [10, -11], 3600., gib,
             ylabel='Memory Usage GB', do_rate=False)
    else:
        # 'intel': the second panel is left empty.
        plot(ax[0], ts, [1], 3600., 1e9, ylabel='FP Ginst/s')
        plot(ax[2], ts, [8, -9], 3600., gib,
             ylabel='Memory Usage GB', do_rate=False)

    # lnet sum rate
    plot(ax[3], ts, [3, 4], 3600., 1024.**2, ylabel='Total lnet MB/s')

    # Remaining IB sum rate
    ib_label = ('Total (ib_sw-lnet) MB/s' if is_snb
                else 'Total (ib_ext-lnet) MB/s')
    plot(ax[4], ts, [5, 6, -3, -4], 3600., 1024.**2, ylabel=ib_label)

    # CPU user time
    plot(ax[5], ts, [7], 3600., wayness * 100.,
         xlabel='Time (hr)', ylabel='Total cpu user\nfraction')

    print(ts.j.id + ': ')

    plt.subplots_adjust(hspace=0.35)
    if wide:
        left_text = header + '\n' + my_utils.summary_text(ld, ts)
        text_len = len(left_text.split('\n'))
        fontsize = ax[0].yaxis.label.get_size()
        linespacing = 1.2
        fontrate = float(fontsize * linespacing) / 72. / 15.5
        # this doesn't quite work. fontrate is too small by a small amount
        yloc = .8 - fontrate * (text_len - 1)
        plt.figtext(.05, yloc, left_text, linespacing=linespacing)
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'wide_master'])
    else:
        if header is not None:
            title = header + '\n' + ts.title
            if threshold:
                title += ', V: %(v)-6.1f' % {'v': threshold}
            title += '\n' + ld.title()
            plt.suptitle(title)
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'master'])

    if mode == 'hist':
        fname += '_hist'
    elif mode == 'percentile':
        fname += '_perc'

    plt.close()

    return fig, fname
def master_plot(file, threshold=False):
    """Save the six-panel amd64 "master" plot for one job.

    Panels, top to bottom: SSE_FLOPS rate, DCSF rate, DRAM rate, lnet
    rx+tx in MB/s, (ib_sw - lnet) in MB/s and CPU user rate, each with one
    line per host.  Jobs that fail to load, are rejected by
    ``tspl_utils.checkjob(ts, 3600, 16)`` or ran on fewer than two hosts
    are skipped.  The figure is written to ``graph_<jobid>_master``.
    ``threshold``, when truthy, is appended to the title.
    """
    k1 = [
        'amd64_core', 'amd64_core', 'amd64_sock', 'lnet', 'lnet', 'ib_sw',
        'ib_sw', 'cpu'
    ]
    k2 = [
        'SSE_FLOPS', 'DCSF', 'DRAM', 'rx_bytes', 'tx_bytes', 'rx_bytes',
        'tx_bytes', 'user'
    ]

    try:
        print(file)
        ts = tspl.TSPLSum(file, k1, k2)
    except tspl.TSPLException:
        return

    if not tspl_utils.checkjob(ts, 3600, 16):
        return
    if ts.numhosts < 2:
        print(ts.j.id + ': 1 host')
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0
    fig, ax = plt.subplots(6, 1, figsize=(8, 12), dpi=80)

    def draw(panel, values_for, divisor=None):
        """Plot d(values)/dt per host on ``panel``, optionally rescaled."""
        panel.hold = True
        for host in ts.j.hosts.keys():
            rate = numpy.divide(numpy.diff(values_for(host)),
                                numpy.diff(ts.t))
            if divisor is not None:
                rate = rate / divisor
            panel.plot(tmid / 3600, rate)

    def key_label(i):
        return 'Total ' + ts.k1[i] + '\n' + ts.k2[i] + '/s'

    # Flop, DCSF and DRAM rates
    for i in (0, 1, 2):
        draw(ax[i], lambda host, i=i: ts.data[i][host][0])
        ax[i].set_ylabel(key_label(i))

    # lnet sum rate
    draw(ax[3],
         lambda host: ts.data[3][host][0] + ts.data[4][host][0],
         1024. * 1024.)
    ax[3].set_ylabel('Total lnet MB/s')

    # Remaining IB sum rate
    draw(ax[4],
         lambda host: (ts.data[5][host][0] + ts.data[6][host][0]
                       - (ts.data[3][host][0] + ts.data[4][host][0])),
         1024 * 1024.)
    ax[4].set_ylabel('Total (ib_sw-lnet) MB/s')

    # CPU user time, scaled before differentiating
    draw(ax[5], lambda host: ts.data[7][host][0] / 100 / ts.wayness)
    ax[5].set_ylabel(key_label(7))
    ax[5].set_xlabel('Time (hr)')

    print(ts.j.id + ': ')

    title = ts.title
    if threshold:
        title += ', V: %(v)-8.3f' % {'v': threshold}
    plt.suptitle(title)
    plt.subplots_adjust(hspace=0.35)

    for panel in ax:
        tspl_utils.adjust_yaxis_range(panel, 0.1)

    fname = '_'.join(['graph', ts.j.id, 'master'])
    fig.savefig(fname)
    plt.close()
def main():
    """Save a two-panel plot of raw and mean-shifted rates for each job.

    The upper panel shows each series' rate (truncated to int64); the lower
    panel shows the same rate minus its mean absolute value.  ``-f``
    selects ``tspl.TSPLBase`` and the ``_full`` suffix.  Jobs rejected by
    ``tspl_utils.checkjob(ts, 3600, 16)`` or with fewer than two hosts are
    skipped.  Output is ``graph_<jobid>_<k1>_<k2>_adjust[_full]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', help='Set full mode', action='store_true')
    parser.add_argument('key1', help='First key', nargs='?',
                        default='amd64_core')
    parser.add_argument('key2', help='Second key', nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg',
                        help='File, directory, or quoted'
                        ' glob pattern',
                        nargs='?', default='jobs')
    n = parser.parse_args()

    for path in tspl_utils.getfilelist(n.filearg):
        try:
            if n.f:
                full = '_full'
                ts = tspl.TSPLBase(path, [n.key1], [n.key2])
            else:
                full = ''
                ts = tspl.TSPLSum(path, [n.key1], [n.key2])
        except tspl.TSPLException:
            continue

        if not tspl_utils.checkjob(ts, 3600, 16):  # 1 hour, 16way only
            continue
        elif ts.numhosts < 2:  # At least 2 hosts
            print(ts.j.id + ': 1 host')
            continue

        print(ts.j.id)

        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0
        dt = numpy.diff(ts.t)

        # (A leftover loop that copied the first series into an unused
        # variable used to sit here; it had no effect and was dropped.)
        fig, ax = plt.subplots(2, 1, figsize=(8, 6), dpi=80)
        ax[0].hold = True
        ax[1].hold = True

        xmin, xmax = 0., 0.
        xmin1, xmax1 = 0., 0.
        for v in ts:
            rate = numpy.array(numpy.divide(numpy.diff(v), dt),
                               dtype=numpy.int64)
            # Mean absolute rate (L1 norm divided by the sample count).
            d = numpy.linalg.norm(rate, ord=1) / float(len(rate))
            shifted = rate - d
            xmin, xmax = min(xmin, min(rate)), max(xmax, max(rate))
            xmin1, xmax1 = min(xmin1, min(shifted)), max(xmax1, max(shifted))
            ax[0].plot(tmid, rate)
            ax[1].plot(tmid, shifted)

        xmin, xmax = tspl_utils.expand_range(xmin, xmax, .1)
        xmin1, xmax1 = tspl_utils.expand_range(xmin1, xmax1, .1)

        ax[0].set_ylim(bottom=xmin, top=xmax)
        ax[1].set_ylim(bottom=xmin1, top=xmax1)

        fname = '_'.join(
            ['graph', ts.j.id, ts.k1[0], ts.k2[0], 'adjust' + full])
        fig.savefig(fname)
        plt.close()
def compute_ratio(file, lariat_dict=None):
    """Compute per-task memory/L1 and stall/clock ratios for one job.

    Returns:
        ``(jobid, su, executable_name, mean_data_ratio, mean_stall_ratio,
        mean_mem_rate)``, or None when the job is skipped.  A job is
        skipped when it fails to load, is rejected by
        ``tspl_utils.checkjob``, has no usable lariat record matching the
        job's wayness, has zero tasks, triggers a ``RuntimeWarning`` in
        one of the guarded divisions, or yields a mean stall ratio above 1
        or a mean memory rate above 75e9/16.
    """
    k1 = ['intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'intel_snb',
          'intel_snb', 'intel_snb', 'intel_snb']
    k2 = ['CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'SIMD_D_256',
          'SSE_D_ALL', 'STALLS', 'CLOCKS_UNHALTED_CORE']
    try:
        ts = tspl.TSPLSum(file, k1, k2)
    except tspl.TSPLException:
        return

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600., range(1, 33), ignore_qs):
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

    if lariat_dict is None:
        ld = lariat_utils.LariatData(ts.j.id, end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    if ld.exc == 'unknown' or ld.wayness != ts.wayness:
        # try loading older lariat
        ld = lariat_utils.LariatData(ts.j.id, end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path,
                                     olddata=ld.ld)
    if ld.exc == 'unknown' or ld.wayness != ts.wayness:
        # Still nothing; return
        return

    # One accumulator per key, in the same order as the key lists above.
    rates = [numpy.zeros_like(tmid) for _ in k1]
    for host in ts.j.hosts.keys():
        for idx, acc in enumerate(rates):
            acc += numpy.diff(ts.assemble([idx], host, 0)) / numpy.diff(ts.t)

    tasks = float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    if tasks == 0:
        print(' '.join(['No tasks in', str(ts.j.id), ' skipping']))
        return

    for acc in rates:
        acc /= tasks

    (read_rate, write_rate, l1_rate, avx_rate, sse_rate, stall_rate,
     clock_rate) = rates

    try:
        data_ratio = (read_rate + write_rate) / l1_rate
    except RuntimeWarning:
        print(' '.join(['Division by zero, skipping:', str(ts.j.id)]))
        return

    flops = avx_rate + sse_rate
    try:
        # The value itself is unused; only a RuntimeWarning raised here
        # matters, because it makes the job be skipped.
        flops_ratio = (flops - numpy.min(flops)) / (numpy.max(flops) -
                                                    numpy.min(flops))
    except RuntimeWarning:
        print(' '.join(['Division by zero, skipping:', str(ts.j.id)]))
        return

    try:
        stall_ratio = stall_rate / clock_rate
    except RuntimeWarning:
        print(' '.join(['Division by zero, skipping:', str(ts.j.id)]))
        return

    mean_data_ratio = numpy.mean(data_ratio)
    mean_stall_ratio = numpy.mean(stall_ratio)
    mean_mem_rate = numpy.mean(read_rate + write_rate) * 64.0

    if mean_stall_ratio > 1.:
        return
    if mean_mem_rate > 75. * 1000000000. / 16.:
        return

    ename = ld.comp_name(ld.exc.split('/')[-1], ld.equiv_patterns)

    return (ts.j.id, ts.su, ename, mean_data_ratio, mean_stall_ratio,
            mean_mem_rate)
def do_compute(file):
    """Build a comma-separated summary line for one multi-host job.

    The same seven counters as above (CAS_READS, CAS_WRITES, LOAD_L1D_ALL,
    SIMD_D_256, SSE_D_ALL, STALLS, CLOCKS_UNHALTED_CORE) are loaded through
    ``tspl.TSPLSum``, turned into rates, summed over hosts and divided by
    the host count.

    Args:
        file: Job source handed unchanged to ``tspl.TSPLSum``.

    Returns:
        A string ``"id,owner,executable,mean_mem_rate,mean_stall_ratio,
        mean_data_ratio,mean_flops"``, or ``None`` when the job is skipped:
        the time series cannot be built, either ``checkjob`` call rejects
        it, it ran on fewer than two hosts, its executable is ``'unknown'``
        or its mean memory rate exceeds 2e9.
    """
    try:
        ts = tspl.TSPLSum(
            file,
            ['intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'intel_snb',
             'intel_snb', 'intel_snb', 'intel_snb'],
            ['CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'SIMD_D_256',
             'SSE_D_ALL', 'STALLS', 'CLOCKS_UNHALTED_CORE'],
        )
    except tspl.TSPLException:
        return None

    if not tspl_utils.checkjob(ts, 0, 16):
        return None
    elif ts.numhosts < 2:
        print(ts.j.id + ': 1 host')
        return None

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600., range(1, 33), ignore_qs):
        return None

    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 '/scratch/projects/lariatData')
    if ld.exc == 'unknown':
        return None

    # Interval midpoints; only used as the shape/dtype template below.
    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

    # One accumulator per requested counter, in the order passed to TSPLSum.
    rates = [numpy.zeros_like(tmid) for _ in range(7)]
    dt = numpy.diff(ts.t)  # ts.t is not modified in this function
    for host in ts.j.hosts.keys():
        for idx, rate in enumerate(rates):
            rate += numpy.diff(ts.assemble([idx], host, 0)) / dt
    for rate in rates:
        rate /= ts.numhosts

    (read_rate, write_rate, l1_rate, avx_rate, sse_rate,
     stall_rate, clock_rate) = rates

    data_ratio = (read_rate + write_rate) / l1_rate
    flops = avx_rate + sse_rate
    # NOTE(review): flops_ratio is never used; it is kept only because
    # removing it could change which inputs raise (empty series, or
    # division warnings promoted to errors).  Confirm and delete.
    flops_ratio = ((flops - numpy.min(flops)) /
                   (numpy.max(flops) - numpy.min(flops)))
    stall_ratio = stall_rate / clock_rate

    mean_data_ratio = numpy.mean(data_ratio)
    mean_stall_ratio = numpy.mean(stall_ratio)
    mean_flops = numpy.mean(flops)

    ename = ld.comp_name(ld.exc.split('/')[-1], ld.equiv_patterns)

    mean_mem_rate = numpy.mean(read_rate + write_rate)
    if mean_mem_rate > 2e9:
        # Put a print in here and investigate bad jobs
        return None

    fields = [
        ts.j.id,
        ts.owner,
        ename,
        str(mean_mem_rate),
        str(mean_stall_ratio),
        str(mean_data_ratio),
        str(mean_flops),
    ]
    return ','.join(fields)
def is_unfloppy(file, thresh):
    """Tell whether a busy job has a low flops-to-memory ratio.

    Flop, memory and cpu-user counters are loaded for either the
    ``amd64`` or the ``intel_snb`` counter family, converted to rates,
    summed over hosts and scaled by the per-family ``peak`` values.  The
    scaled flop and memory fractions are printed on one line.

    Args:
        file: Job source handed unchanged to ``tspl.TSPLSum``.
        thresh: Upper bound on (flop fraction) / (memory fraction).

    Returns:
        ``True`` when the scaled cpu rate is above 0.5 and the
        flop/memory ratio is below ``thresh``; ``False`` otherwise;
        ``None`` when the job is skipped (time series cannot be built,
        ``checkjob`` rejects it, or it ran on fewer than two hosts).
    """
    k1 = {
        'amd64': ['amd64_core', 'amd64_sock', 'cpu'],
        'intel_snb': ['intel_snb', 'intel_snb', 'intel_snb', 'cpu'],
    }
    k2 = {
        'amd64': ['SSE_FLOPS', 'DRAM', 'user'],
        'intel_snb': ['SIMD_D_256', 'SSE_D_ALL', 'LOAD_L1D_ALL', 'user'],
    }
    # Scaling values per family: [flops, memory, cpu].
    peak = {
        'amd64': [2.3e9 * 16 * 2, 24e9, 1.],
        'intel_snb': [16 * 2.7e9 * 2, 16 * 2.7e9 / 2. * 64., 1.],
    }
    # Rows of ts.data feeding the (flops, memory, cpu) accumulators.  On
    # intel_snb two counters add up to the flop rate and the memory slot
    # is fed by LOAD_L1D_ALL.
    layout = {
        'amd64': ((0,), (1,), (2,)),
        'intel_snb': ((0, 1), (2,), (3,)),
    }

    try:
        ts = tspl.TSPLSum(file, k1, k2)
    except tspl.TSPLException:
        return None

    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600, range(1, 33), ignore_qs):
        return None
    elif ts.numhosts < 2:  # At least 2 hosts
        print(ts.j.id + ': 1 host')
        return None

    nsteps = len(ts.t) - 1
    gfloprate = numpy.zeros(nsteps)
    gdramrate = numpy.zeros(nsteps)
    gcpurate = numpy.zeros(nsteps)
    accumulators = (gfloprate, gdramrate, gcpurate)

    dt = numpy.diff(ts.t)  # ts.t is not modified in this function
    # An unrecognized family accumulates nothing, as before; the peak
    # lookup below then raises KeyError for it.
    rows = layout.get(ts.pmc_type, ((), (), ()))
    for h in ts.j.hosts.keys():
        for acc, indices in zip(accumulators, rows):
            for i in indices:
                acc += numpy.divide(numpy.diff(ts.data[i][h][0]), dt)

    mfr = scipy.stats.tmean(gfloprate) / ts.numhosts
    mdr = scipy.stats.tmean(gdramrate) / ts.numhosts
    mcr = scipy.stats.tmean(gcpurate) / (ts.numhosts * ts.wayness * 100.)

    peak_flops, peak_mem, peak_cpu = peak[ts.pmc_type]
    flop_frac = mfr / peak_flops
    mem_frac = mdr / peak_mem

    # Single-argument print gives the same text on Python 2 and Python 3.
    print(str(flop_frac) + ' ' + str(mem_frac))

    # `and` short-circuits, so the ratio is only formed for busy jobs.
    return bool(mcr / peak_cpu > 0.5 and flop_frac / mem_frac < thresh)