def update(meta = None): if not meta: return #Job.objects.all().delete() # Only need to populate lariat cache once jobid = meta.json.keys()[0] ld = lariat_utils.LariatData(jobid, end_epoch = meta.json[jobid]['end_epoch'], directory = sys_path_append.lariat_path, daysback = 2) for jobid, json in meta.json.iteritems(): if Job.objects.filter(id = jobid).exists(): continue ld = lariat_utils.LariatData(jobid, olddata = ld.ld) json['user'] = ld.user json['exe'] = ld.exc.split('/')[-1] json['cwd'] = ld.cwd json['run_time'] = meta.json[jobid]['end_epoch'] - meta.json[jobid]['start_epoch'] json['threads'] = ld.threads try: job_model, created = Job.objects.get_or_create(**json) except: print "Something wrong with json",json return
def ls4_update(meta=None): if not meta: return #LS4Job.objects.all().delete() # Only need to populate lariat cache once jobid = meta.json.keys()[0] ld = lariat_utils.LariatData(jobid, end_epoch=meta.json[jobid]['end_epoch'], directory=sys_path_append.lariat_path, daysback=2) for jobid, json in meta.json.iteritems(): if LS4Job.objects.filter(id=jobid).exists(): continue ld = lariat_utils.LariatData(jobid, olddata=ld.ld) if json['exit_status'] != 0: json['status'] = 'TIMEOUT/CANCELLED' else: json['status'] = 'COMPLETED' if json['failed'] != 0: json['status'] = 'FAILED' json['nodes'] = str(int(json['slots']) / 12) json['cores'] = str( int(json['granted_pe'].rstrip('way')) * int(json['nodes'])) json['run_time'] = meta.json[jobid]['end_epoch'] - meta.json[jobid][ 'start_epoch'] jsondb = {} jsondb['id'] = json['id'] jsondb['project'] = json['account'] jsondb['start_time'] = json['start_time'] jsondb['end_time'] = json['end_time'] jsondb['start_epoch'] = json['start_epoch'] jsondb['end_epoch'] = json['end_epoch'] jsondb['run_time'] = json['run_time'] jsondb['queue'] = json['queue'] jsondb['name'] = json['name'] jsondb['status'] = json['status'] jsondb['nodes'] = json['nodes'] jsondb['cores'] = json['cores'] jsondb['path'] = json['path'] jsondb['date'] = json['date'] jsondb['user'] = json['owner'] # LD jsondb['exe'] = ld.exc.split('/')[-1] jsondb['cwd'] = ld.cwd jsondb['threads'] = ld.threads try: job_model, created = LS4Job.objects.get_or_create(**jsondb) except: print "Something wrong with json", jsondb return
def __init__(self,jobid,k1,k2,aggregate=True,stats=None): ## Build ts and ld object for a job self.k1=k1 self.k2=k2 self.jobid=jobid self.aggregate=aggregate try: if self.aggregate: self.ts=tspl.TSPLSum(jobid,self.k1,self.k2,job_data=stats) else: self.ts=tspl.TSPLBase(jobid,self.k1,self.k2,job_data=stats) if not self.ld: self.ld=lariat_utils.LariatData() self.ld.get_job(self.ts.j.id, end_epoch=self.ts.j.end_time, daysback=3, directory=lariat_path) return except tspl.TSPLException as e: return except EOFError as e: print 'End of file found reading: ' + jobid return
def do_check(f, jobs):
    """Record the executable path for a qualifying job.

    Loads the SSE FLOPS series from *f*; when the job passes the sanity
    check (at least one hour long, wayness 1-32) its Lariat executable
    path is stored in *jobs*, keyed by job id.  Rejected or unreadable
    jobs are silently skipped.
    """
    try:
        series = tspl.TSPLSum(f, ['amd64_core'], ['SSE_FLOPS'])
    except tspl.TSPLException:
        return

    # Require >= 3600 s of runtime and a wayness between 1 and 32.
    if not tspl_utils.checkjob(series, 3600, range(1, 33)):
        return

    lariat = lariat_utils.LariatData(series.j.id, series.j.end_time,
                                     analyze_conf.lariat_path)
    jobs[series.j.id] = lariat.exc
def getcode(file, code, output_dir): try: ts = tspl.TSPLBase(file, ['lnet'], ['rx_bytes']) except tspl.TSPLException as e: return ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time, analyze_conf.lariat_path) ename = ld.exc.split('/')[-1] ename = ld.comp_name(ename, ld.equiv_patterns) if ename == code: print ts.j.id, ename, ts.wayness masterplot.master_plot(file, output_dir=output_dir, mintime=1, wayness=ts.wayness)
def plot_ratios(ts, tmid, ratio, ratio2, rate, var, fig, ax, full):
    # Plot the two imbalance-ratio series (top axis) and the per-host
    # rates (bottom axis) for one job, then save the figure.  ``full``
    # is a suffix appended to the output filename.
    # Compute y-axis min and max, expand the limits by 10%
    ymin = min(numpy.minimum(ratio, ratio2))
    ymax = max(numpy.maximum(ratio, ratio2))
    ymin, ymax = tspl_utils.expand_range(ymin, ymax, 0.1)

    # Lariat record is only used for the executable name in the title.
    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 analyze_conf.lariat_path)
    print '---------------------'
    ax[0].plot(tmid / 3600, ratio)
    ax[0].hold = True
    ax[0].plot(tmid / 3600, ratio2)
    ax[0].legend(('Std Dev', 'Max Diff'), loc=4)
    ax[1].hold = True
    ymin1 = 0.  # This is wrong in general, but we don't want the min to be > 0.
    ymax1 = 0.
    for v in rate:
        ymin1 = min(ymin1, min(v))
        ymax1 = max(ymax1, max(v))
        ax[1].plot(tmid / 3600, v)

    ymin1, ymax1 = tspl_utils.expand_range(ymin1, ymax1, 0.1)

    title = ts.title
    if ld.exc != 'unknown':
        title += ', E: ' + ld.exc.split('/')[-1]
    title += ', V: %(V)-8.3g' % {'V': var}
    plt.suptitle(title)
    ax[0].set_xlabel('Time (hr)')
    ax[0].set_ylabel('Imbalance Ratios')
    ax[1].set_xlabel('Time (hr)')
    ax[1].set_ylabel('Total ' + ts.label(ts.k1[0], ts.k2[0]) + '/s')
    ax[0].set_ylim(bottom=ymin, top=ymax)
    ax[1].set_ylim(bottom=ymin1, top=ymax1)

    fname = '_'.join(['graph', ts.j.id, ts.owner, ts.k1[0], ts.k2[0],
                      'imbalance' + full])
    fig.savefig(fname)
    plt.close()
def mem_usage(file): try: ts = tspl.TSPLSum(file, ['mem'], ['MemUsed']) except tspl.TSPLException as e: print e return [] ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time, analyze_conf.lariat_path) mem_max = 0. for host in ts.j.hosts.keys(): mem_max = max(numpy.max(ts.data[0][host]), mem_max) mem_per_core = mem_max / (1024. * 1024. * 1024. * float(ts.wayness)) print ts.j.id, ': ', mem_per_core, ts.wayness, ld.threads if (int(ts.wayness) * int(ld.threads)) > 16: print ts.j.id, 'used more than one thread per core!' if (int(ts.wayness)*int(ld.threads)) <= 16 and \ (int(ts.wayness)*int(ld.threads)) > 0 : return [mem_per_core] else: return []
def compute_imbalance(file, k1, k2, thresh, lariat_dict): try: ts = tspl.TSPLBase(file, k1, k2) except tspl.TSPLException as e: return except EOFError as e: print 'End of file found reading: ' + file return ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev'] if not tspl_utils.checkjob(ts, 3600, 16, ignore_qs): # 1 hour, 16way only return elif ts.numhosts < 2: # At least 2 hosts print ts.j.id + ': 1 host' return if lariat_dict == None: ld = lariat_utils.LariatData(ts.j.id, end_epoch=ts.j.end_time, daysback=3, directory=analyze_conf.lariat_path) else: ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict) if ld.wayness == -1: print 'Unknown wayness: ', ts.j.id return elif ld.wayness != ts.wayness: print 'Lariat and TACC Stats disagree about wayness. Skipping: ', ts.j.id return tmid = (ts.t[:-1] + ts.t[1:]) / 2.0 rng = range(1, len(tmid)) # Throw out first and last tmid = tmid[rng] for h in ts.data[0].keys(): host_data = ts.data[0][h] maxval = numpy.zeros(len(rng)) minval = numpy.ones(len(rng)) * 1e100 rate = [] for v in host_data: rate.append(numpy.diff(v)[rng] / numpy.diff(ts.t)[rng]) maxval = numpy.maximum(maxval, rate[-1]) minval = numpy.minimum(minval, rate[-1]) vals = [] mean = [] std = [] for j in range(len(rng)): vals.append([]) for v in rate: vals[j].append(v[j]) mean.append(scipy.stats.tmean(vals[j])) std.append(scipy.stats.tstd(vals[j])) ratio = numpy.divide(std, mean) var = scipy.stats.tmean(ratio) if abs(var) > thresh: print ts.j.id + ': ' + str(var) return file
def compute_ratio(file, lariat_dict=None):
    # Summarize a Sandy Bridge job: mean DRAM-to-L1 traffic ratio, mean
    # stall fraction and mean per-task memory bandwidth.  Returns a
    # (jobid, su, ename, mean_data_ratio, mean_stall_ratio,
    # mean_mem_rate) tuple, or None when the job is filtered out or the
    # Lariat data cannot be resolved.
    try:
        ts = tspl.TSPLSum(file, [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'intel_snb',
            'intel_snb', 'intel_snb', 'intel_snb'
        ], [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'SIMD_D_256',
            'SSE_D_ALL', 'STALLS', 'CLOCKS_UNHALTED_CORE'
        ])
    except tspl.TSPLException as e:
        return

    # Skip GPU/visualization queues and jobs shorter than an hour.
    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600., range(1, 33), ignore_qs):
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

    if lariat_dict == None:
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    if ld.exc == 'unknown' or ld.wayness != ts.wayness:
        # try loading older lariat
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path,
                                     olddata=ld.ld)
        if ld.exc == 'unknown' or ld.wayness != ts.wayness:
            # Still nothing; return
            return

    read_rate = numpy.zeros_like(tmid)
    write_rate = numpy.zeros_like(tmid)
    l1_rate = numpy.zeros_like(tmid)
    avx_rate = numpy.zeros_like(tmid)
    sse_rate = numpy.zeros_like(tmid)
    stall_rate = numpy.zeros_like(tmid)
    clock_rate = numpy.zeros_like(tmid)

    # Sum each counter's rate of change across all hosts.
    for host in ts.j.hosts.keys():
        read_rate += numpy.diff(ts.assemble([0], host, 0)) / numpy.diff(ts.t)
        write_rate += numpy.diff(ts.assemble([1], host, 0)) / numpy.diff(ts.t)
        l1_rate += numpy.diff(ts.assemble([2], host, 0)) / numpy.diff(ts.t)
        avx_rate += numpy.diff(ts.assemble([3], host, 0)) / numpy.diff(ts.t)
        sse_rate += numpy.diff(ts.assemble([4], host, 0)) / numpy.diff(ts.t)
        stall_rate += numpy.diff(ts.assemble([5], host, 0)) / numpy.diff(ts.t)
        clock_rate += numpy.diff(ts.assemble([6], host, 0)) / numpy.diff(ts.t)

    if float(ts.numhosts * int(ts.wayness) * int(ld.threads)) == 0:
        print 'No tasks in', ts.j.id, ' skipping'
        return

    # Normalize each summed rate to a per-task rate.
    read_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    write_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    l1_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    avx_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    sse_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    stall_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))
    clock_rate /= float(ts.numhosts * int(ts.wayness) * int(ld.threads))

    # NOTE(review): catching RuntimeWarning as an exception only works
    # if warnings have been promoted to errors (e.g. numpy.seterr or
    # warnings.simplefilter elsewhere); otherwise a zero denominator
    # silently yields inf/nan -- confirm the warning configuration.
    try:
        data_ratio = (read_rate + write_rate) / l1_rate
    except RuntimeWarning:
        print 'Division by zero, skipping:', ts.j.id
        return
    flops = avx_rate + sse_rate
    try:
        flops_ratio = (flops - numpy.min(flops)) / (numpy.max(flops) -
                                                    numpy.min(flops))
    except RuntimeWarning:
        print 'Division by zero, skipping:', ts.j.id
        return
    try:
        stall_ratio = stall_rate / clock_rate
    except RuntimeWarning:
        print 'Division by zero, skipping:', ts.j.id
        return

    mean_data_ratio = numpy.mean(data_ratio)
    mean_stall_ratio = numpy.mean(stall_ratio)
    # 64 bytes per cache-line transaction converts line rate to bytes/s.
    mean_mem_rate = numpy.mean(read_rate + write_rate) * 64.0

    # Reject physically implausible stall ratios and jobs beyond the
    # per-core memory bandwidth cutoff.
    if mean_stall_ratio > 1.:
        return
    elif mean_mem_rate > 75. * 1000000000. / 16.:
        return

    ename = ld.exc.split('/')[-1]
    ename = ld.comp_name(ename, ld.equiv_patterns)

    ##  if mean_mem_rate > 2e9: # Put a print in here and investigate bad jobs
    ##    return

    return (ts.j.id, ts.su, ename, mean_data_ratio, mean_stall_ratio,
            mean_mem_rate)
def main():
    # Driver: fan ``do_work`` over a process pool, then report the
    # top-50 executables by accumulated SUs and print jobs with CPI > 1
    # among those executables.
    parser = argparse.ArgumentParser(description='Look for imbalance between'
                                     'hosts for a pair of keys')
    parser.add_argument('filearg', help='File, directory, or quoted'
                        ' glob pattern', nargs='?', default='jobs')
    parser.add_argument('-p', help='Set number of processes',
                        nargs=1, type=int, default=[1])
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)
    procs = min(len(filelist), n.p[0])

    # Preload the Lariat cache from the first job's end date so worker
    # processes can reuse the dict instead of re-reading the files.
    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time
    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if procs < 1:
        print 'Must have at least one file'
        exit(1)

    pool = multiprocessing.Pool(processes=procs)

    partial_work = functools.partial(do_work,
                                     mintime=3600.,
                                     wayness=16,
                                     lariat_dict=ld.ld)

    results = pool.map(partial_work, filelist)

    print len(results)

    # Accumulate SUs per executable name; f_stall is None for jobs that
    # do_work rejected.
    sus = {}
    for (f_stall, mem_rate, cpi, ename, jid, user, su) in results:
        if f_stall is None:
            continue
        if ename in sus:
            sus[ename] += su
        else:
            sus[ename] = su

    d = collections.Counter(sus)

    enames = zip(*d.most_common(50))[0]

    for k, v in d.most_common(50):
        print k, v

    # Report high-CPI jobs belonging to the top-50 executables.
    for (f_stall, mem_rate, cpi, ename, jid, user, su) in results:
        if (f_stall is None) or (not ename in enames):
            continue
        # Cycles per execution cycle, derived from the stall fraction.
        cpec = 1. / (1. - f_stall)
        if cpi > 1.0:  # and cpec > 2.0:
            print jid, ename, cpi, cpec, user, sus[ename]
def do_compute(file):
    # Summarize a Sandy Bridge job as a CSV line: job id, owner,
    # executable, mean memory bandwidth, stall ratio, DRAM/L1 data ratio
    # and mean flops.  Returns None when the job is filtered out.
    try:
        ts = tspl.TSPLSum(file, [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'intel_snb',
            'intel_snb', 'intel_snb', 'intel_snb'
        ], [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'SIMD_D_256',
            'SSE_D_ALL', 'STALLS', 'CLOCKS_UNHALTED_CORE'
        ])
    except tspl.TSPLException as e:
        return

    if not tspl_utils.checkjob(ts, 0, 16):
        return
    elif ts.numhosts < 2:
        print ts.j.id + ': 1 host'
        return

    # NOTE(review): this second checkjob call repeats the filtering with
    # different arguments, which may make the first call redundant --
    # confirm against tspl_utils.checkjob's semantics.
    ignore_qs = ['gpu', 'gpudev', 'vis', 'visdev']
    if not tspl_utils.checkjob(ts, 3600., range(1, 33), ignore_qs):
        return

    ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time,
                                 '/scratch/projects/lariatData')
    if ld.exc == 'unknown':
        return

    tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

    read_rate = numpy.zeros_like(tmid)
    write_rate = numpy.zeros_like(tmid)
    l1_rate = numpy.zeros_like(tmid)
    avx_rate = numpy.zeros_like(tmid)
    sse_rate = numpy.zeros_like(tmid)
    stall_rate = numpy.zeros_like(tmid)
    clock_rate = numpy.zeros_like(tmid)

    # Sum each counter's rate of change across all hosts.
    for host in ts.j.hosts.keys():
        read_rate += numpy.diff(ts.assemble([0], host, 0)) / numpy.diff(ts.t)
        write_rate += numpy.diff(ts.assemble([1], host, 0)) / numpy.diff(ts.t)
        l1_rate += numpy.diff(ts.assemble([2], host, 0)) / numpy.diff(ts.t)
        avx_rate += numpy.diff(ts.assemble([3], host, 0)) / numpy.diff(ts.t)
        sse_rate += numpy.diff(ts.assemble([4], host, 0)) / numpy.diff(ts.t)
        stall_rate += numpy.diff(ts.assemble([5], host, 0)) / numpy.diff(ts.t)
        clock_rate += numpy.diff(ts.assemble([6], host, 0)) / numpy.diff(ts.t)

    # Average over hosts.
    read_rate /= ts.numhosts
    write_rate /= ts.numhosts
    l1_rate /= ts.numhosts
    avx_rate /= ts.numhosts
    sse_rate /= ts.numhosts
    stall_rate /= ts.numhosts
    clock_rate /= ts.numhosts

    data_ratio = (read_rate + write_rate) / l1_rate
    flops = avx_rate + sse_rate
    # flops_ratio is computed but never used below -- presumably a
    # leftover from an earlier version; kept as-is.
    flops_ratio = (flops - numpy.min(flops)) / (numpy.max(flops) -
                                                numpy.min(flops))
    stall_ratio = stall_rate / clock_rate

    mean_data_ratio = numpy.mean(data_ratio)
    mean_stall_ratio = numpy.mean(stall_ratio)
    mean_flops = numpy.mean(flops)

    ename = ld.exc.split('/')[-1]
    ename = ld.comp_name(ename, ld.equiv_patterns)
    mean_mem_rate = numpy.mean(read_rate + write_rate)
    if mean_mem_rate > 2e9:  # Put a print in here and investigate bad jobs
        return

    return ','.join([
        ts.j.id, ts.owner, ename,
        str(mean_mem_rate),
        str(mean_stall_ratio),
        str(mean_data_ratio),
        str(mean_flops)
    ])
def main():
    # Driver: compute data/stall/memory ratios for many jobs in
    # parallel (compute_ratio), then draw a box plot and two scatter
    # plots for the most common executables.
    mem_rate_thresh = 0.5 * 75 * 1000000000 / 16
    stall_thresh = 0.5
    parser = argparse.ArgumentParser(description='Correlations')
    parser.add_argument('-p', help='Set number of processes',
                        nargs=1, type=int, default=[1])
    parser.add_argument('-n', help='Set number of executables to catalog',
                        nargs=1, type=int, default=[15])
    parser.add_argument('-s', help='Use SUs instead of job counts',
                        action='store_true')
    parser.add_argument('filearg', help='File, directory, or quoted'
                        ' glob pattern', nargs='?', default='jobs')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)

    # Preload the Lariat cache from the first job; workers reuse it.
    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time
    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if n.p[0] < 1:
        print 'Must have at least one file'
        exit(1)

    partial_compute = functools.partial(compute_ratio, lariat_dict=ld.ld)

    pool = multiprocessing.Pool(processes=n.p[0])
    res = pool.map(partial_compute, filelist)
    pool.close()
    pool.join()

    # Group per-job means by executable name (data ratio, stall ratio,
    # memory rate and accumulated SUs).
    mdr = {}
    msr = {}
    mmr = {}
    sus = {}
    for tup in res:
        try:
            (jobid, su, ename, mean_data_ratio, mean_stall_ratio,
             mean_mem_rate) = tup
        except TypeError as e:
            # compute_ratio returned None for this job.
            continue
        if ename in mdr:
            mdr[ename] = numpy.append(mdr[ename],
                                      numpy.array([mean_data_ratio]))
            msr[ename] = numpy.append(msr[ename],
                                      numpy.array([mean_stall_ratio]))
            mmr[ename] = numpy.append(mmr[ename],
                                      numpy.array([mean_mem_rate]))
            sus[ename] += su
        else:
            mdr[ename] = numpy.array([mean_data_ratio])
            msr[ename] = numpy.array([mean_stall_ratio])
            mmr[ename] = numpy.array([mean_mem_rate])
            sus[ename] = su

        # Report jobs that stall heavily without being memory-bound.
        if (mean_mem_rate <= mem_rate_thresh) and \
           (mean_stall_ratio > stall_thresh):
            print ename, jobid, mean_mem_rate / 1000000000, mean_stall_ratio

    # Find top codes by SUs
    top_count = {}
    for k in mdr.keys():
        if n.s:
            top_count[k] = sus[k]  # by sus
        else:
            top_count[k] = len(mdr[k])  # by count

    d = collections.Counter(top_count)

    # Keep only the top-N executables; log-scale the ratio and BW data.
    mdr2 = {}
    msr2 = {}
    mmr2 = {}
    for k, v in d.most_common(n.n[0]):
        print k, v
        mdr2[k] = numpy.log10(mdr[k])
        msr2[k] = msr[k]
        mmr2[k] = numpy.log10(mmr[k])

    # for k in mdr.keys():
    #   if len(mdr[k]) < 5:
    #     continue
    #   mdr2[k]=mdr[k]

    # Box plot of log(data ratio); box widths scale with popularity.
    x = [top_count[k] for k in mdr2.keys()]
    l = len(mdr2.keys())
    y = numpy.linspace(0.10, 0.95, l)
    widths = numpy.interp(x, numpy.linspace(5.0, float(max(x)), l), y)

    fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=80)
    plt.subplots_adjust(hspace=0.35, bottom=0.25)

    ax.boxplot(mdr2.values(), widths=widths)
    xtickNames = plt.setp(ax, xticklabels=mdr2.keys())
    plt.setp(xtickNames, rotation=45, fontsize=8)
    ax.set_ylabel(r'log(DRAM BW/L1 Fill Rate)')

    fname = 'box_mdr'
    fig.savefig(fname)
    plt.close()

    # Scatter: stall fraction vs log(data ratio), one marker per code.
    markers = itertools.cycle(('o', 'x', '+', '^', 's', '8', 'p', 'h', '*',
                               'D', '<', '>', 'v', 'd', '.'))
    colors = itertools.cycle(('b', 'g', 'r', 'c', 'm', 'k', 'y'))

    fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)
    for k in mdr2.keys():
        ax.plot(mdr2[k], msr2[k],
                marker=markers.next(),
                markeredgecolor=colors.next(),
                linestyle='', markerfacecolor='None')
        ax.hold = True
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    ax.legend(mdr2.keys(), bbox_to_anchor=(1.05, 1),
              loc=2, borderaxespad=0., numpoints=1)
    ax.set_xlabel('log(DRAM BW/L1 Fill Rate)')
    ax.set_ylabel('Stall Fraction')

    fname = 'msr_v_mdr'
    fig.savefig(fname)
    plt.close()

    # Scatter: stall fraction vs log(memory BW), with threshold lines.
    markers = itertools.cycle(('o', 'x', '+', '^', 's', '8', 'p', 'h', '*',
                               'D', '<', '>', 'v', 'd', '.'))
    colors = itertools.cycle(('b', 'g', 'r', 'c', 'm', 'k', 'y'))

    fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)
    for k in mdr2.keys():
        ax.plot(mmr2[k], msr2[k],
                marker=markers.next(),
                markeredgecolor=colors.next(),
                linestyle='', markerfacecolor='None')
        ax.hold = True
    ax.plot(numpy.log10([mem_rate_thresh, mem_rate_thresh]), [
        0.95 * min(numpy.concatenate(msr2.values())),
        1.05 * max(numpy.concatenate(msr2.values()))
    ], 'r--')
    # Leftover debug print of the stall-threshold line endpoints.
    print [
        min(numpy.concatenate(mmr2.values())),
        max(numpy.concatenate(mmr2.values()))
    ], [stall_thresh, stall_thresh], 'r--'
    ax.plot([
        min(numpy.concatenate(mmr2.values())),
        max(numpy.concatenate(mmr2.values()))
    ], [stall_thresh, stall_thresh], 'r--')

    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    ax.legend(mdr2.keys(), bbox_to_anchor=(1.05, 1),
              loc=2, borderaxespad=0., numpoints=1)
    ax.set_xlabel('log(DRAM BW)')
    ax.set_ylabel('Stall Fraction')

    fname = 'msr_v_mem'
    fig.savefig(fname)
    plt.close()
def main():
    # Scan jobs for high Lustre metadata operation rates; for any job
    # whose per-host summed metadata rate exceeds the threshold, plot
    # the rate of every individual operation and save the figure.
    parser = argparse.ArgumentParser(description='Look for high meta data rate'
                                     ' to Lustre')
    parser.add_argument('-t', metavar='thresh',
                        help='Treshold metadata rate',
                        nargs=1, default=[100000.])
    parser.add_argument('filearg', help='File, directory, or quoted'
                        ' glob pattern', nargs='?', default='jobs')
    n = parser.parse_args()
    thresh = float(n.t[0])
    print thresh

    filelist = tspl_utils.getfilelist(n.filearg)

    # k1=['llite', 'llite', 'llite', 'llite', 'llite',
    #     'llite', 'llite', 'llite', 'llite', 'llite',
    #     'llite', 'llite', 'llite', 'llite', 'llite',
    #     'llite', 'llite', 'llite', 'llite', 'llite',
    #     'llite', 'llite', 'llite', 'llite', 'llite',
    #     'llite']
    # k2=['open','close','mmap','seek','fsync','setattr',
    #     'truncate','flock','getattr','statfs','alloc_inode',
    #     'setxattr','getxattr',' listxattr',
    #     'removexattr', 'inode_permission', 'readdir',
    #     'create','lookup','link','unlink','symlink','mkdir',
    #     'rmdir','mknod','rename',]

    # One 'llite' entry per tracked metadata operation in k2.
    k1 = [
        'llite', 'llite', 'llite', 'llite', 'llite',
        'llite', 'llite', 'llite', 'llite', 'llite',
        'llite', 'llite', 'llite', 'llite', 'llite',
        'llite', 'llite', 'llite', 'llite', 'llite',
        'llite', 'llite', 'llite',
    ]
    k2 = [
        'open', 'close', 'mmap', 'fsync', 'setattr',
        'truncate', 'flock', 'getattr', 'statfs', 'alloc_inode',
        'setxattr', ' listxattr',
        'removexattr', 'readdir',
        'create', 'lookup', 'link', 'unlink', 'symlink', 'mkdir',
        'rmdir', 'mknod', 'rename',
    ]

    for file in filelist:
        try:
            ts = tspl.TSPLSum(file, k1, k2)
        except tspl.TSPLException as e:
            continue

        if not tspl_utils.checkjob(ts, 3600., range(1, 33)):
            continue

        tmid = (ts.t[:-1] + ts.t[1:]) / 2.0

        # Lariat record is only used for the executable name in the title.
        ld = lariat_utils.LariatData(ts.j.id, ts.j.end_time, 'lariatData')

        meta_rate = numpy.zeros_like(tmid)

        # Sum all metadata ops across hosts, then average per host.
        for k in ts.j.hosts.keys():
            meta_rate += numpy.diff(ts.assemble(range(0, len(k1)), k, 0)) / \
                numpy.diff(ts.t)

        meta_rate /= float(ts.numhosts)

        if numpy.max(meta_rate) > thresh:
            title = ts.title
            if ld.exc != 'unknown':
                title += ', E: ' + ld.exc.split('/')[-1]

            fig, ax = plt.subplots(1, 1, figsize=(10, 8), dpi=80)
            plt.subplots_adjust(hspace=0.35)
            plt.suptitle(title)
            markers = ('o', 'x', '+', '^', 's', '8', 'p',
                       'h', '*', 'D', '<', '>', 'v', 'd', '.')
            colors = ('b', 'g', 'r', 'c', 'm', 'k', 'y')
            # cnt indexes the current key/operation; one line per host
            # per operation, all labeled with the operation name.
            cnt = 0
            for v in ts.data:
                for host in v:
                    for vals in v[host]:
                        rate = numpy.diff(vals) / numpy.diff(ts.t)
                        c = colors[cnt % len(colors)]
                        m = markers[cnt % len(markers)]
                        # print cnt,(cnt % len(colors)), (cnt % len(markers)), k2[cnt], c, m
                        ax.plot(tmid / 3600., rate,
                                marker=m,
                                markeredgecolor=c,
                                linestyle='-', color=c,
                                markerfacecolor='None',
                                label=k2[cnt])
                        ax.hold = True
                cnt = cnt + 1
            ax.set_ylabel('Meta Data Rate (op/s)')
            tspl_utils.adjust_yaxis_range(ax, 0.1)

            # Deduplicate legend labels (one entry per operation).
            handles, labels = ax.get_legend_handles_labels()
            new_handles = {}
            for h, l in zip(handles, labels):
                new_handles[l] = h

            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
            ax.legend(new_handles.values(), new_handles.keys(),
                      prop={'size': 8}, bbox_to_anchor=(1.05, 1),
                      borderaxespad=0., loc=2)

            fname = '_'.join(['metadata', ts.j.id, ts.owner])
            fig.savefig(fname)
            plt.close()
def master_plot(file,
                mode='lines',
                threshold=False,
                output_dir='.',
                prefix='graph',
                mintime=3600,
                wayness=16,
                header='Master',
                lariat_dict=None,
                wide=False,
                job_stats=None):
    # Produce the six-panel "master" overview plot for one job: flops,
    # memory bandwidth, memory usage, lnet traffic, IB-minus-lnet
    # traffic and CPU user fraction.  Returns (fig, fname) on success,
    # None when the job is rejected or the PMC type is unsupported.
    #
    # Counter keys per architecture; the numeric indices used in the
    # plot() calls below refer to positions in these lists.
    k1 = {
        'amd64': [
            'amd64_core', 'amd64_core', 'amd64_sock', 'lnet', 'lnet',
            'ib_sw', 'ib_sw', 'cpu'
        ],
        'intel': [
            'intel_pmc3', 'intel_pmc3', 'intel_pmc3', 'lnet', 'lnet',
            'ib_ext', 'ib_ext', 'cpu', 'mem', 'mem'
        ],
        'intel_snb': [
            'intel_snb_imc', 'intel_snb_imc', 'intel_snb', 'lnet', 'lnet',
            'ib_sw', 'ib_sw', 'cpu', 'intel_snb', 'intel_snb', 'mem', 'mem'
        ],
    }

    k2 = {
        'amd64': [
            'SSE_FLOPS', 'DCSF', 'DRAM', 'rx_bytes', 'tx_bytes',
            'rx_bytes', 'tx_bytes', 'user'
        ],
        'intel': [
            'MEM_LOAD_RETIRED_L1D_HIT', 'FP_COMP_OPS_EXE_X87',
            'INSTRUCTIONS_RETIRED', 'rx_bytes', 'tx_bytes',
            'port_recv_data', 'port_xmit_data', 'user', 'MemUsed',
            'AnonPages'
        ],
        'intel_snb': [
            'CAS_READS', 'CAS_WRITES', 'LOAD_L1D_ALL', 'rx_bytes',
            'tx_bytes', 'rx_bytes', 'tx_bytes', 'user', 'SSE_D_ALL',
            'SIMD_D_256', 'MemUsed', 'AnonPages'
        ],
    }

    try:
        print file
        ts = tspl.TSPLSum(file, k1, k2, job_stats)
    except tspl.TSPLException as e:
        return

    ignore_qs = []  #'gpu','gpudev','vis','visdev']
    if not tspl_utils.checkjob(ts, mintime, wayness, ignore_qs):
        return

    # Three modes for Lariat data: load from disk (None), skip the
    # lookup entirely ("pass"), or reuse a pre-loaded cache dict.
    if lariat_dict == None:
        ld = lariat_utils.LariatData(ts.j.id,
                                     end_epoch=ts.j.end_time,
                                     daysback=3,
                                     directory=analyze_conf.lariat_path)
    elif lariat_dict == "pass":
        ld = lariat_utils.LariatData(ts.j.id)
    else:
        ld = lariat_utils.LariatData(ts.j.id, olddata=lariat_dict)

    # Trust the smaller of the two wayness values when they disagree.
    wayness = ts.wayness
    if ld.wayness != -1 and ld.wayness < ts.wayness:
        wayness = ld.wayness

    if wide:
        fig, ax = plt.subplots(6, 2, figsize=(15.5, 12), dpi=110)
        # Make 2-d array into 1-d, and reorder so that the left side is blank
        ax = my_utils.flatten(ax)
        ax_even = ax[0:12:2]
        ax_odd = ax[1:12:2]
        ax = ax_odd + ax_even

        for a in ax_even:
            a.axis('off')
    else:
        fig, ax = plt.subplots(6, 1, figsize=(8, 12), dpi=110)

    # Select the panel-drawing helper for the requested mode.
    if mode == 'hist':
        plot = plot_thist
    elif mode == 'percentile':
        plot = plot_mmm
    else:
        plot = plot_lines

    if ts.pmc_type == 'intel_snb':
        # Plot key 1
        plot(ax[0], ts, [8, 9], 3600., 1e9,
             ylabel='Total AVX +\nSSE Ginst/s')
        # Plot key 2
        plot(ax[1], ts, [0, 1], 3600., 1.0 / 64.0 * 1024. * 1024. * 1024.,
             ylabel='Total Mem BW GB/s')
        #Plot key 3
        #plot(ax[2],ts,[2],3600.,1.0/64.0*1e9, ylabel='L1 BW GB/s')
        # Negative index means "subtract that series" (MemUsed - AnonPages).
        plot(ax[2], ts, [10, -11], 3600., 1024.0 * 1024.0 * 1024.0,
             ylabel='Memory Usage GB',
             do_rate=False)
    elif ts.pmc_type == 'intel':
        plot(ax[0], ts, [1], 3600., 1e9, ylabel='FP Ginst/s')
        plot(ax[2], ts, [8, -9], 3600., 1024.0 * 1024.0 * 1024.0,
             ylabel='Memory Usage GB',
             do_rate=False)
    else:
        #Fix this to support the old amd plots
        print ts.pmc_type + ' not supported'
        return

    # Plot lnet sum rate
    plot(ax[3], ts, [3, 4], 3600., 1024.**2, ylabel='Total lnet MB/s')

    # Plot remaining IB sum rate (IB traffic minus the lnet portion).
    if ts.pmc_type == 'intel_snb':
        plot(ax[4], ts, [5, 6, -3, -4], 3600., 1024.**2,
             ylabel='Total (ib_sw-lnet) MB/s')
    elif ts.pmc_type == 'intel':
        plot(ax[4], ts, [5, 6, -3, -4], 3600., 1024.**2,
             ylabel='Total (ib_ext-lnet) MB/s')

    #Plot CPU user time
    plot(ax[5], ts, [7], 3600., wayness * 100.,
         xlabel='Time (hr)',
         ylabel='Total cpu user\nfraction')

    print ts.j.id + ': '

    plt.subplots_adjust(hspace=0.35)
    if wide:
        # Wide layout: job summary text in the blank left column.
        left_text = header + '\n' + my_utils.summary_text(ld, ts)
        text_len = len(left_text.split('\n'))
        fontsize = ax[0].yaxis.label.get_size()
        linespacing = 1.2
        fontrate = float(fontsize * linespacing) / 72. / 15.5
        yloc = .8 - fontrate * (text_len - 1)  # this doesn't quite work.
                                               # fontrate is too small by a
                                               # small amount
        plt.figtext(.05, yloc, left_text, linespacing=linespacing)
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'wide_master'])
    elif header != None:
        title = header + '\n' + ts.title
        if threshold:
            title += ', V: %(v)-6.1f' % {'v': threshold}
        title += '\n' + ld.title()
        plt.suptitle(title)
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'master'])
    else:
        fname = '_'.join([prefix, ts.j.id, ts.owner, 'master'])

    if mode == 'hist':
        fname += '_hist'
    elif mode == 'percentile':
        fname += '_perc'

    plt.close()

    return fig, fname
def main():
    # Driver: flag imbalanced jobs for one key pair across many files in
    # parallel, then plot each flagged job via do_mp.
    parser = argparse.ArgumentParser(description='Look for imbalance between'
                                     'hosts for a pair of keys')
    parser.add_argument('threshold', help='Treshold ratio for std dev:mean',
                        nargs='?', default=0.25)
    parser.add_argument('key1', help='First key', nargs='?',
                        default='amd64_core')
    parser.add_argument('key2', help='Second key', nargs='?',
                        default='SSE_FLOPS')
    parser.add_argument('filearg', help='File, directory, or quoted'
                        ' glob pattern', nargs='?', default='jobs')
    parser.add_argument('-p', help='Set number of processes',
                        nargs=1, type=int, default=[1])
    parser.add_argument('-o', help='Output directory',
                        nargs=1, type=str, default=['.'],
                        metavar='output_dir')
    # parser.add_argument('-f', help='Set full mode', action='store_true')
    # parser.add_argument('-n', help='Disable plots', action='store_true')
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)
    procs = min(len(filelist), n.p[0])

    # Preload the Lariat cache from the first job; workers reuse it.
    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time
    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if procs < 1:
        print 'Must have at least one file'
        exit(1)

    pool = multiprocessing.Pool(processes=procs)

    partial_imbal = functools.partial(compute_imbalance,
                                      k1=[n.key1],
                                      k2=[n.key2],
                                      thresh=float(n.threshold),
                                      lariat_dict=ld.ld)
    res = pool.map(partial_imbal, filelist)
    pool.close()
    pool.join()

    # compute_imbalance returns the filename for flagged jobs, else None.
    flagged_jobs = [r for r in res if r]
    print flagged_jobs
    print len(flagged_jobs)

    if len(flagged_jobs) != 0:
        pool = multiprocessing.Pool(processes=min(n.p[0], len(flagged_jobs)))
        # Pair each flagged file with the output dir for the worker.
        pool.map(do_mp,
                 zip(flagged_jobs,
                     [n.o[0] for x in flagged_jobs]))  # Pool.starmap should exist....
        pool.close()
        pool.join()
def main():
    # Driver: compute stall/miss/CPI statistics for many jobs in
    # parallel (do_work), then scatter-plot CPI against cycles per
    # execution cycle on log (fig1) and linear (fig2) axes.
    parser = argparse.ArgumentParser(description='Look for imbalance between'
                                     'hosts for a pair of keys')
    parser.add_argument('filearg', help='File, directory, or quoted'
                        ' glob pattern', nargs='?', default='jobs')
    parser.add_argument('-p', help='Set number of processes',
                        nargs=1, type=int, default=[1])
    n = parser.parse_args()

    filelist = tspl_utils.getfilelist(n.filearg)
    procs = min(len(filelist), n.p[0])

    # Preload the Lariat cache from the first job; workers reuse it.
    job = pickle.load(open(filelist[0]))
    jid = job.id
    epoch = job.end_time
    ld = lariat_utils.LariatData(jid,
                                 end_epoch=epoch,
                                 daysback=3,
                                 directory=analyze_conf.lariat_path)

    if procs < 1:
        print 'Must have at least one file'
        exit(1)

    pool = multiprocessing.Pool(processes=procs)

    partial_work = functools.partial(do_work,
                                     mintime=3600.,
                                     wayness=16,
                                     lariat_dict=ld.ld)

    results = pool.map(partial_work, filelist)

    fig1, ax1 = plt.subplots(1, 1, figsize=(20, 8), dpi=80)
    fig2, ax2 = plt.subplots(1, 1, figsize=(20, 8), dpi=80)

    maxx = 0.
    # Two passes: flagged (True) then unflagged (False) jobs, drawn on
    # the same axes with per-executable marker/color combinations.
    for state in [True, False]:
        stalls = []
        misses = []
        cpis = []
        enames = []

        for (s, m, cpi, ename, flag) in results:
            if (s != None and m > 0. and m < 1.0 and flag == state):
                stalls.extend([s])
                misses.extend([m])
                cpis.extend([cpi])
                enames.extend([ename])

        markers = itertools.cycle(('o', 'x', '+', '^', 's', '8', 'p',
                                   'h', '*', 'D', '<', '>', 'v', 'd', '.'))
        colors = itertools.cycle(('b', 'g', 'r', 'c', 'm', 'k', 'y'))

        fmt = {}
        for e in enames:
            if not e in fmt:
                fmt[e] = markers.next() + colors.next()

        for (s, c, e) in zip(stalls, cpis, enames):
            # ax1.plot(numpy.log10(1.-(1.-s)),numpy.log10(c),
            # x value is cycles per execution cycle, 1/(1 - stall frac).
            maxx = max(maxx, 1. / (1. - s))
            ax1.plot((1. / (1. - s)), (c),
                     marker=fmt[e][0],
                     markeredgecolor=fmt[e][1],
                     linestyle='', markerfacecolor='None', label=e)
            ax1.hold = True
            ax2.plot((1. / (1. - s)), (c),
                     marker=fmt[e][0],
                     markeredgecolor=fmt[e][1],
                     linestyle='', markerfacecolor='None', label=e)
            ax2.hold = True

    #ax.plot(numpy.log10(stalls),numpy.log10(cpis),fmt)
    #ax.plot(numpy.log10(1.0/(1.0-numpy.array(stalls))),numpy.log10(cpis),fmt)

    ax1.set_xscale('log')
    ax1.set_xlim(left=0.95, right=1.05 * maxx)
    ax1.set_yscale('log')

    box = ax1.get_position()
    ax1.set_position([box.x0, box.y0, box.width * 0.45, box.height])
    box = ax2.get_position()
    ax2.set_position([box.x0, box.y0, box.width * 0.45, box.height])

    # Deduplicate legend entries (one per executable) for each axis.
    handles = []
    labels = []
    for h, l in zip(*ax1.get_legend_handles_labels()):
        if l in labels:
            continue
        else:
            handles.extend([h])
            labels.extend([l])
    ax1.legend(handles, labels, bbox_to_anchor=(1.05, 1),
               loc=2, borderaxespad=0., numpoints=1, ncol=4)
    ax1.set_xlabel('log(Cycles per Execution Cycle)')
    ax1.set_ylabel('log(CPI)')

    handles = []
    labels = []
    for h, l in zip(*ax2.get_legend_handles_labels()):
        if l in labels:
            continue
        else:
            handles.extend([h])
            labels.extend([l])
    ax2.legend(handles, labels, bbox_to_anchor=(1.05, 1),
               loc=2, borderaxespad=0., numpoints=1, ncol=4)
    ax2.set_xlabel('Cycles per Execution Cycle')
    ax2.set_ylabel('CPI')

    fname = 'miss_v_stall_log'
    fig1.savefig(fname)
    fname = 'miss_v_stall'
    fig2.savefig(fname)
    plt.close()
def main(): parser = argparse.ArgumentParser( description='Plot important stats for jobs') parser.add_argument('-m', help='Plot mode: lines, hist, percentile', nargs=1, type=str, default=['lines'], metavar='mode') parser.add_argument('-o', help='Output directory', nargs=1, type=str, default=['.'], metavar='output_dir') parser.add_argument('filearg', help='File, directory, or quoted' ' glob pattern', nargs='?', default='jobs') parser.add_argument('-p', help='Set number of processes', nargs=1, type=int, default=[1]) parser.add_argument('-s', help='Set minimum time in seconds', nargs=1, type=int, default=[3600]) parser.add_argument('-w', help='Set wide plot format', action='store_true') n = parser.parse_args() filelist = tspl_utils.getfilelist(n.filearg) procs = min(len(filelist), n.p[0]) job = pickle.load(open(filelist[0])) jid = job.id epoch = job.end_time ld = lariat_utils.LariatData(jid, end_epoch=epoch, daysback=3, directory=analyze_conf.lariat_path) if procs < 1: print 'Must have at least one file' exit(1) pool = multiprocessing.Pool(processes=procs) partial_master = functools.partial(mp_wrapper, mode=n.m[0], threshold=False, output_dir=n.o[0], prefix='graph', mintime=n.s[0], wayness=[x + 1 for x in range(16)], lariat_dict=ld.ld, wide=n.w) pool.map(partial_master, filelist) pool.close() pool.join()