def update_jct(workload, update_global=False):
    # type: (Workload, bool) -> None
    """Parse the JCT of a finished workload from its output file and record it.

    Raises ValueError if the workload never started or exited non-zero.
    When update_global is True, the measured JCT is also pushed into the
    global per-workload geometry table.
    """
    proc = workload.proc
    if proc is None or proc.returncode != 0:
        raise ValueError(f'Workload {workload.name} not started or terminated in error')

    # Extract the 'JCT: <seconds>' line emitted by the workload
    jct = parse_output_float(workload.output_file, r'^JCT: ([0-9.]+) .*')
    workload.geometry.jct = jct

    if update_global:
        wtl = WTL.from_name(workload.name)
        wtl.add_geometry(workload.rcfg, workload.executor, ResourceGeometry(jct=jct))
def do_mem(logdir, network, batch_size):
    """Measure memory allocation behavior for one workload.

    Runs the workload either on plain TF (capturing allocator VLOG output and
    filtering it into ``alloc.output``) or inside a Salus server configured
    with the ``memop`` log preset, depending on ``FLAGS.use_salus``.

    :param logdir: base output directory (a pathlib-like path)
    :param network: workload network name, e.g. ``'speech'``
    :param batch_size: batch size to run with
    :returns: the final destination directory containing the results
    """
    batch_num = 20
    # speech iterations are much longer; use fewer of them
    if network == "speech":
        batch_num = 5
    logger.info(f'Measuring memory for {network}_{batch_size} for {batch_num} iter')

    # results are grouped under an executor-specific subdirectory
    ex = "salus" if FLAGS.use_salus else "tf"
    final_dst = logdir / ex / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        if not FLAGS.use_salus:
            logger.info(' Running on TF')
            wl = WTL.create(network, batch_size, batch_num, Executor.TF)
            # enable verbose allocator logging so +/- allocation lines appear
            wl.env['TF_CPP_MIN_VLOG_LEVEL'] = '1'
            wl.env['TF_CPP_MIN_LOG_LEVEL'] = ''
            run_tf(outputdir, wl)
            # filter and move file to a more convenient name; only the first
            # file in the output directory is processed (note the break)
            for f in pathlib.Path(outputdir).iterdir():
                with f.with_name('alloc.output').open('w') as file:
                    # keep only allocation (+) / deallocation (-) lines
                    grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                    grep.wait()
                f.unlink()
                break
        else:
            scfg = maybe_forced_preset(presets.AllocProf)
            # 'memop' logconf makes Salus emit memory operation logs
            scfg.logconf = "memop"
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            with server.run():
                logger.info(' Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')

    return final_dst
def gen_workload_list(selection):
    # type: (str) -> Iterable[Tuple[WTL, RunConfig]]
    """Select workloads based on commandline"""
    if selection:
        # explicit comma-separated canonical names, e.g. 'alexnet_25,vgg11_64'
        pairs = []
        for cname in unique(selection.split(','), stable=True):
            if '_' not in cname:
                raise UsageError(f"Not a canonical name: {cname}")
            name, bs = cname.split('_', 1)
            bs = try_with_default(int, bs, ValueError)(bs)
            pairs.append((WTL.from_name(name), bs))
    else:
        # default: every known workload and batch size except the blacklist
        blacklist = ['speech', 'seq2seq', 'mnistlg', 'mnistsf', 'mnistcv']
        pairs = (
            (wtl, bs)
            for key, wtl in WTL.known_workloads.items()
            if key not in blacklist
            for bs in wtl.available_batch_sizes()
        )

    # Find all available batch_num with JCT and mem data
    return (
        (wtl, RunConfig(bs, bn, None))
        for wtl, bs in pairs
        for bn in wtl.available_batch_nums(bs)
    )
def do_jct(logdir, network, batch_size):
    """Do basic JCT on workload

    Measures job completion time for one workload on TF, optionally TFDist,
    and Salus. Each run is skipped when its output file already exists and
    ``FLAGS.resume`` is set. Salus runs are skipped entirely under MPS.

    :param logdir: base output directory (a pathlib-like path)
    :param network: workload network name
    :param batch_size: batch size to run with
    :returns: the final destination directory containing the results
    """
    batch_num = 20
    final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        logger.info(f'Measuring basic JCT for {batch_num} iterations')
        # output files get a '-mps' suffix when running under MPS
        mps_name = '-mps' if FLAGS.is_mps else ''
        if not (final_dst/'gpu{}.output'.format(mps_name)).exists() or not FLAGS.resume:
            logger.info(' Running on TF')
            WTL.block_run(network, batch_size, batch_num, Executor.TF, outputdir / 'gpu{}.output'.format(mps_name))

        if FLAGS.do_tfdist:
            if not (final_dst/'tfdist{}.output'.format(mps_name)).exists() or not FLAGS.resume:
                with TFDistServer().run():
                    logger.info(' Running on TFDist')
                    WTL.block_run(network, batch_size, batch_num, Executor.TFDist, outputdir / 'tfdist{}.output'.format(mps_name))

        if FLAGS.is_mps:
            # Salus is not run under MPS
            logger.info(' Skipping Salus jct when MPS is on')
            return final_dst

        if not (final_dst / 'rpc.output').exists() or not FLAGS.resume:
            scfg = maybe_forced_preset(presets.MostEfficient)
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            with server.run():
                logger.info(' Warming up Salus')
                # always use 20 batch num when warming up
                WTL.block_run(network, batch_size, 20, Executor.Salus, outputdir / 'rpc-warm.output')

                logger.info(' Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')

    return final_dst
def getbs(name):
    """Expand a workload name into (name, batch_size) pairs.

    A canonical name like 'alexnet_25' yields exactly that single pair;
    a bare name yields one pair per available batch size.
    """
    if '_' not in name:
        sizes = WTL.from_name(name).available_batch_sizes()
        return zip([name] * len(sizes), sizes)
    base, size = name.split('_')
    return [(base, int(size))]
def getbs(name):
    """Expand a workload name into (name, batch_size) pairs.

    A canonical name like 'alexnet_25' yields that one pair; a bare name
    yields a single pair using the first available batch size.
    """
    if '_' in name:
        base, size = name.split('_')
        return [(base, int(size))]
    # using a single batch size is enough
    available = WTL.from_name(name).available_batch_sizes()
    return [(name, next(iter(available)))]
def expandbs(name):
    """Expand a workload name into (name, batch_size) pairs.

    A canonical name like 'alexnet_25' yields exactly that pair. A bare
    name is expanded over the enclosing-scope `batch_size` filter: all
    available sizes when it is None, otherwise only the requested sizes
    that are actually available.
    """
    if '_' not in name:
        avail = WTL.from_name(name).available_batch_sizes()
        if batch_size is None:
            chosen = avail
        else:
            chosen = [bs for bs in batch_size if bs in avail]
        return zip([name] * len(chosen), chosen)
    base, size = name.split('_')
    return [(base, int(size))]
def find_geometry(w, field):
    """Return the geometry value `field` for workload `w`, filling it in
    from another batch_num's recorded geometry when missing.

    :type w: Workload
    :type field: str
    """
    value = w.geometry[field]
    if value is not None:
        return value

    # check for another bn
    for batch_num in w.wtl.available_batch_nums(w.batch_size):
        geo = WTL.from_name(w.name).geometry(RunConfig(w.batch_size, batch_num, None), w.executor)
        found = geo[field]
        if found is not None:
            # cache it on the workload so later lookups are direct
            w.geometry[field] = found
            return found

    return None
def select_workloads(argv):
    # type: (Iterable[str]) -> Iterable[(str, TBatchSize)]
    """Select workloads based on commandline"""
    if argv:
        # each argv piece may itself be a comma-separated list
        names = unique(
            (name for piece in argv for name in piece.split(',')),
            stable=True)
    else:
        names = WTL.known_workloads.keys()

    # TODO: return directly WTL instances
    selected = []
    for name in names:
        for bs in WTL.from_name(name).available_batch_sizes():
            selected.append((name, bs))
    return selected
def do_mem(logdir, network, batch_size):
    """Save a model checkpoint for the workload by running it briefly on TF.

    NOTE(review): despite the name, this variant does not measure memory; it
    runs the workload with SALUS_SAVE_MODEL=1 so the model checkpoint is
    written out to a shared legacy-checkpoint directory.

    :param logdir: base output directory (a pathlib-like path)
    :param network: workload network name
    :param batch_size: batch size to run with
    :returns: the final destination directory containing the results
    """
    batch_num = 10
    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')
    final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        logger.info(' Running on TF')
        wl = WTL.create(network, batch_size, batch_num, Executor.TF)
        # instruct the benchmark to save its model
        wl.env['SALUS_SAVE_MODEL'] = '1'

        # HACK: hard-coded user-specific checkpoint location — consider
        # making this a flag
        model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
        model_dir = model_dir.expanduser().resolve()
        wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)

        run_tf(outputdir, wl)
    return final_dst
def do_mem(logdir, network, batch_size):
    """Run the workload on TF with model saving enabled and keep only the
    allocator (+/-) log lines as ``alloc.output``.

    :param logdir: base output directory (a pathlib-like path)
    :param network: workload network name
    :param batch_size: batch size to run with
    :returns: the final destination directory containing the results
    """
    batch_num = 10
    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')
    final_dst = logdir / 'tf' / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        logger.info(' Running on TF')
        wl = WTL.create(network, batch_size, batch_num, Executor.TF)
        # instruct the benchmark to save its model
        wl.env['SALUS_SAVE_MODEL'] = '1'
        run_tf(outputdir, wl)

        # filter and move file to a more convenient name; only the first
        # file in the output directory is processed (note the break)
        for f in pathlib.Path(outputdir).iterdir():
            with f.with_name('alloc.output').open('w') as file:
                # keep only allocation (+) / deallocation (-) lines
                grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                grep.wait()
            f.unlink()
            break
    return final_dst
def main(argv):
    # type: (Sequence[str]) -> None
    """Run increasing numbers of concurrent alexnet_25 workloads on Salus
    until the average JCT degrades past FLAGS.break_when relative to the
    single-job reference JCT.

    If explicit actions are given on the command line, run those instead.
    """
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'
    scfg.disable_adc = True

    if argv:
        run_seq(scfg.copy(output_dir=FLAGS.save_dir), *parse_actions_from_cmd(argv))
        return

    wtl = WTL.from_name('alexnet')
    rcfg = RunConfig(25, 2726, None)

    # check if we have reference JCT
    reference_jct = wtl.geometry(rcfg, Executor.Salus).jct
    if reference_jct is None:
        # no reference data: measure the concurrent=1 baseline ourselves
        start_from = 1
        logger.warning(f"No reference JCT data available for `{wtl.canonical_name(rcfg)}'")
    else:
        start_from = 2
        report(1, reference_jct, 1)

    logger.info(f'Will stop when JCT degradation larger than {FLAGS.break_when}')

    for concurrent in range(start_from, FLAGS.uplimit):
        # run them at once
        logger.info(f'Runing {concurrent} workloads together')
        workloads = [wtl.create_from_rcfg(rcfg) for _ in range(concurrent)]
        run_seq(scfg.copy(output_dir=FLAGS.save_dir / f"{concurrent}"), *workloads)

        # calculate average jct
        for w in workloads:
            update_jct(w)
        jcts = [w.geometry.jct for w in workloads]
        avgjct = np.mean(jcts)  # type: float

        # BUGFIX: when no reference JCT existed, the first (concurrent=1)
        # iteration establishes the baseline; previously this divided by None.
        if reference_jct is None:
            reference_jct = avgjct
        ratio = avgjct / reference_jct

        report(concurrent, avgjct, ratio)
        if ratio > FLAGS.break_when:
            break