def main(argv):
    """Allocation-profile five alexnet jobs concurrently on Salus, then one by one on TF."""
    scfg = maybe_forced_preset(presets.AllocProf)

    if argv:
        # Explicit actions on the command line replace the built-in scenario.
        run_seq(scfg.copy(output_dir=FLAGS.save_dir), *parse_actions_from_cmd(argv))
        return

    # Five identical training jobs, all admitted to Salus at once.
    salus_jobs = [WTL.create("alexnet", 25, 200) for _ in range(5)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir), *salus_jobs)

    # The same five jobs on plain TF, each one waiting for the previous to finish.
    tf_actions = []
    for idx in range(5):
        if idx:
            tf_actions.append(Pause.Wait)
        tf_actions.append(WTL.create("alexnet", 25, 200, executor=Executor.TF))
    run_tf(FLAGS.save_dir, *tf_actions)
def main(argv):
    """Profile inception3 + alexnet under the preempt scheduler, optionally with reference runs."""
    scfg = maybe_forced_preset(presets.Profiling)
    scfg.scheduler = 'preempt'
    scfg.disable_wc = True

    if argv:
        # Command-line actions override the built-in scenario.
        run_seq(scfg.copy(output_dir=FLAGS.save_dir), *parse_actions_from_cmd(argv))
        return

    scenario = [
        WTL.create("inception3", 25, 1298),
        Pause(60),
        WTL.create("alexnet", 100, 508),
    ]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir), *scenario)

    if not FLAGS.with_ref:
        return

    # we also need reference data
    reference = [
        WTL.create("alexnet", 100, 508),
        Pause.Wait,
        WTL.create("alexnet", 100, 508, executor=Executor.TF),
    ]
    run_seq(presets.MostEfficient(output_dir=FLAGS.save_dir / 'reference'), *reference)
def single(scfg):
    """Run three models back to back, pausing for manual inspection between each."""
    actions = [
        WTL.create("inception3", 100, 165),
        Pause.Manual,
        WTL.create("resnet50", 50, 798),
        Pause.Manual,
        WTL.create("resnet152", 75, 19),
    ]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir), *actions)
def test():
    """Sanity run: one alexnet job on TF, wait for it, then the same job on Salus."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    # BUG: seems we must run a single job first otherwise it will hang
    warmup = WTL.create("alexnet", 25, 20, executor=Executor.TF)
    target = WTL.create("alexnet", 25, 20)
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'test'), warmup, Pause.Wait, target)
def case2():
    """Use OpTracing to see if each iteration is exclusive"""
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.logconf = 'memop'
    jobs = [
        WTL.create("inception3", 100, 165),
        WTL.create("resnet50", 50, 798),
        WTL.create("resnet152", 75, 19),
    ]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir), *jobs)
def case2():
    """Two concurrent inception3 jobs with the lane manager disabled."""
    scfg = maybe_forced_preset(presets.AllocProf)
    scfg.env['SALUS_DISABLE_LANEMGR'] = '1'
    out_cfg = scfg.copy(output_dir=FLAGS.save_dir / 'case2')
    run_seq(out_cfg,
            WTL.create("inception3", 100, 20),
            WTL.create("inception3", 100, 20))
def case1():
    """Run the inception3 eval workload on plain TF first, then on Salus."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    out_dir = FLAGS.save_dir / 'case1'
    run_tf(out_dir, WTL.create("inception3eval", 1, 1000, executor=Executor.TF))
    run_seq(scfg.copy(output_dir=out_dir), WTL.create("inception3eval", 1, 1000))
def case2():
    """super_res on TF, wait for it, then the same job on Salus."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    # BUG: seems we must run a single job first otherwise it will hang
    warmup = WTL.create("super_res", 128, 20, executor=Executor.TF)
    target = WTL.create("super_res", 128, 20)
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case2'), warmup, Pause.Wait, target)
def main(argv):
    """Trace two concurrent resnet101 jobs on Salus with the pack scheduler."""
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.scheduler = 'pack'
    # Firstly run concurrently on salus
    jobs = [WTL.create("resnet101", 50, 47) for _ in range(2)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus"), *jobs)
def case2():
    """Memop tracing: a warm-up alexnet run, then two concurrent alexnet runs."""
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.logconf = 'memop'
    # BUG: seems we must run a single job first otherwise it will hang
    actions = [
        WTL.create("alexnet", 25, 10),
        Pause.Wait,
        WTL.create("alexnet", 25, 50),
        WTL.create("alexnet", 25, 50),
    ]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case2'), *actions)
def case1():
    """Memop tracing: warm-up inception3, then inception3 + resnet50 together."""
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.logconf = 'memop'
    # BUG: seems we must run a single job first otherwise it will hang
    actions = [
        WTL.create("inception3", 100, 20),
        Pause.Wait,
        WTL.create("inception3", 100, 20),
        WTL.create("resnet50", 50, 20),
    ]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case1'), *actions)
def case2():
    """Inception3_100 is missing some dealloc log entry"""
    scfg = maybe_forced_preset(presets.Debugging)
    # Run inception3 at shrinking batch sizes, waiting after each run.
    actions = []
    for batch_size in (100, 50, 25):
        actions += [WTL.create("inception3", batch_size, 10), Pause.Wait]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir), *actions)
def main(argv):
    """Profile two concurrent resnet50 jobs, or whatever actions argv specifies."""
    scfg = maybe_forced_preset(presets.Profiling)

    if argv:
        run_seq(scfg.copy(output_dir=FLAGS.save_dir), *parse_actions_from_cmd(argv))
        return

    pair = [WTL.create("resnet50", 50, 265) for _ in range(2)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir), *pair)
def case2():
    """super_res with verbose allocator logging: once on TF, then once on Salus."""
    # Run on TF
    tf_wl = WTL.create("super_res", 128, 20, executor=Executor.TF)
    tf_wl.env['TF_CPP_MIN_VLOG_LEVEL'] = '2'
    tf_wl.env['TF_CPP_MIN_LOG_LEVEL'] = ''
    run_tf(FLAGS.save_dir / 'case2' / 'tf', tf_wl)

    # Then the same workload on Salus with memop tracing.
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.logconf = 'memop'
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case2' / 'salus'),
            WTL.create("super_res", 128, 20))
def case3():
    """With specially compiled salus, no restriction for how iteration runs,
    i.e. multiple iter can run together, to collect mem data and fragmentation
    """
    scfg = maybe_forced_preset(presets.AllocProf)
    scfg.env['SALUS_DISABLE_LANEMGR'] = '1'
    jobs = [WTL.create("inception3", 25, 20) for _ in range(2)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case3'), *jobs)
def case3():
    """Three inception3 eval jobs under the fair scheduler, staggered by 10 seconds."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'fair'
    scfg.env['SALUS_DISABLE_LANEMGR'] = '1'
    actions = []
    for idx in range(3):
        if idx:
            actions.append(Pause(10))
        actions.append(WTL.create("inception3eval", 50, 250))
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case3'), *actions)
def case4():
    """Compare sequential vs. parallel execution of inception3 + resnet50."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    # BUG: seems we must run a single job first otherwise it will hang
    run_seq(scfg.copy(output_dir=FLAGS.save_dir/'case4'/'seq2'),
            WTL.create("inception3", 100, 100),
            Pause.Wait,
            WTL.create("resnet50", 50, 100))
    # Fix: the parallel run belongs under 'case4' as well; it previously wrote
    # to 'case3'/'par2', scattering this experiment's output across directories.
    run_seq(scfg.copy(output_dir=FLAGS.save_dir/'case4'/'par2'),
            WTL.create("inception3", 100, 100),
            WTL.create("resnet50", 50, 100))
def do_measure(scfg, name, batch_sizes):
    """Measure `name` at each batch size, once on Salus and once on TF.

    Results land in FLAGS.save_dir under 'salus' and 'tf' respectively.
    """
    batch_num = 100
    for bs in batch_sizes:
        # Salus first, then plain TF, for the same (name, bs, batch_num).
        for executor, subdir in ((Executor.Salus, "salus"), (Executor.TF, "tf")):
            wl = WTL.create(name, bs, batch_num, executor=executor)
            set_env(wl)
            run_seq(scfg.copy(output_dir=FLAGS.save_dir / subdir), wl)
def tfmps2(argv):
    # type: (Sequence[str]) -> None
    """Run a foreground inference job against a background inception4 training job on TF.

    argv[0] (optional) names the inference workload; argv[1:] are batch sizes.
    Both jobs block on named pipes so they can be started in a controlled order.
    """
    name = "alexneteval"
    # Fix: accept the workload name even when it is the only argument.
    # The original tested `len(argv) > 1`, silently ignoring a lone name.
    if argv:
        name = argv[0]
    batch_sizes = [int(v) for v in argv[1:]]
    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]
    batch_num = 300
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job, the batch number has no effect here,
            # only used to distinguish different runs
            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
            # make sure it runs long enough
            trainWl.env['SALUS_ITER_SECONDS'] = '300'
            trainWl.extra_args += ['--min_mem']

            # create a pipe to signal trainWl
            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
            os.mkfifo(pipetrain)
            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain

            # create the foreground inference job
            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
            set_env(wl)
            wl.env['SALUS_ITER_SECONDS'] = '150'
            wl.extra_args += ['--min_mem']

            pipe = str(pathlib.Path(td).joinpath('fifo'))
            os.mkfifo(pipe)
            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe

            # NOTE: pipe paths are bound as lambda defaults so the callbacks are
            # immune to the loop rebinding pipetrain/pipe (late-binding closures).
            run_tf(FLAGS.save_dir / "tfmps2" / (name + "-inception4"),
                   wl,  # start the foreground job
                   Pause(20),
                   trainWl,  # start the background job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, p=pipetrain, **kwargs: wait_on_pipe(p)),
                   RunFn(lambda *args, p=pipe, **kwargs: wait_on_pipe(p)),
                   # start train job
                   RunFn(lambda *args, p=pipetrain, **kwargs: release_on_pipe(p)),
                   # wait 10 seconds
                   Pause(10),
                   # release inference job
                   RunFn(lambda *args, p=pipe, **kwargs: release_on_pipe(p)),
                   # run_seq automatically join all jobs at the end of the sequence
                   )
def case2():
    """Three staggered inception3 eval jobs with the pack scheduler, shared lane off."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'
    scfg.logconf = 'log'
    scfg.env['SALUS_DISABLE_SHARED_LANE'] = '1'
    actions = []
    for idx in range(3):
        if idx:
            actions.append(Pause(10))
        actions.append(WTL.create("inception3eval", 50, 250))
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case2'), *actions)
def case3(argv):
    """Baseline JCT on TFDist, then 300 identical jobs packed together on Salus."""
    model, bs, bn = 'resnet50', 50, 500
    # Name the output directory after this function.
    name = inspect.currentframe().f_code.co_name

    # first run one along to get JCT
    baseline = WTL.create(model, bs, bn, executor=Executor.TFDist)
    run_tfdist(FLAGS.save_dir / name, baseline)

    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'
    jobs = [WTL.create(model, bs, bn) for _ in range(300)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / name), *jobs)
def case1():
    """Memop tracing: warm up each model alone, then run both concurrently."""
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.logconf = 'memop'
    # BUG: seems we must run a single job first otherwise it will hang
    warmup = [
        WTL.create("inception3", 100, 10),
        Pause.Wait,
        WTL.create("resnet50", 50, 10),
        Pause.Wait,
    ]
    concurrent = [
        WTL.create("inception3", 100, 20),
        # resnet50 seems to start earlier than inception3 and finishes too early, use more iters (40)
        WTL.create("resnet50", 50, 40),
    ]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case1'), *warmup, *concurrent)
def case1():
    """Three inception3 jobs of shrinking length, 15s apart, round-robin scheduler."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'rr'
    scfg.disable_wc = True
    scfg.env['SALUS_DISABLE_LANEMGR'] = '1'
    actions = [WTL.create("inception3", 50, 145)]
    for iters in (75, 35):
        actions += [Pause(15), WTL.create("inception3", 50, iters)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case1'), *actions)
def case2():
    """Ten identical inception3 eval jobs admitted concurrently."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    jobs = [WTL.create("inception3eval", 1, 1000) for _ in range(10)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case2'), *jobs)
def case2():
    """Trace inception3 + resnet50 with verbose TF logging, then extract allocator lines."""
    scfg = maybe_forced_preset(presets.OpTracing)
    scfg.logconf = 'memop'
    # Verbose logging so the TF allocator prints every alloc/dealloc.
    scfg.env['TF_CPP_MIN_VLOG_LEVEL'] = '1'
    scfg.env['TF_CPP_MIN_LOG_LEVEL'] = ''
    scfg.save_outerr = True
    run_seq(scfg.copy(output_dir=FLAGS.save_dir/'case2'),
            WTL.create("inception3", 100, 20),
            WTL.create("resnet50", 50, 20))

    # filter the TF allocator output
    # Fix: read the stderr of THIS run ('case2'); it previously pointed at
    # 'case1' and would filter a stale or missing file.
    f = FLAGS.save_dir/'case2'/'server.stderr'
    with f.with_name('tfalloc.output').open('w') as file:
        grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
        grep.wait()
def case1():
    """Five inception3 eval jobs under the fair scheduler, 5 seconds apart."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'fair'
    actions = []
    for idx in range(5):
        if idx:
            actions.append(Pause(5))
        actions.append(WTL.create("inception3eval", 50, 250))
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / 'case1'), *actions)
def case1():
    """Sweep eval request rates for inception3 on TFDist, one run per rate."""
    for rate in rates:
        workload = WTL.create("inception3eval", 1, 500, executor=Executor.TFDist)
        # interval is the inverse of the target request rate
        workload.env['SALUS_TFBENCH_EVAL_INTERVAL'] = str(1 / rate)
        workload.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '1'
        workload.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'false'
        run_tfdist(FLAGS.save_dir / 'case1' / str(rate), workload)
def main(argv):
    """Two resnet101 jobs: concurrently on Salus, then one after another on TF."""
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'

    # Firstly run concurrently on salus
    salus_jobs = [WTL.create("resnet101", 50, 47) for _ in range(2)]
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus"), *salus_jobs)

    # Then run on tf
    run_seq(scfg.copy(output_dir=FLAGS.save_dir / "tf"),
            WTL.create("resnet101", 50, 47, executor=Executor.TF),
            Pause.Wait,
            WTL.create("resnet101", 50, 47, executor=Executor.TF))
def select_workloads(argv):
    # type: (Iterable[str]) -> Iterable[(str, TBatchSize)]
    """Select workloads based on commandline

    Example: alexnet,vgg11
    """
    if argv:
        # split comma-separated pieces, dropping duplicates but keeping order
        names = unique((part for piece in argv for part in piece.split(',')), stable=True)
    else:
        names = WTL.known_workloads.keys()

    def expand(spec):
        # 'name_bs' pins a single batch size; a bare name expands to all of them
        if '_' in spec:
            base, bs = spec.split('_')
            return [(base, int(bs))]
        return [(spec, bs) for bs in WTL.from_name(spec).available_batch_sizes()]

    return [WTL.create(n, batch_size, 1, executor=Executor.TF)
            for name in names
            for n, batch_size in expand(name)]
def do_mem(logdir, network, batch_size):
    """Do basic JCT on workload"""
    # Measures per-allocation memory behavior of one workload, on plain TF or
    # on Salus depending on FLAGS.use_salus. Returns the final output directory.
    batch_num = 20
    if network == "speech":
        # speech gets fewer iterations
        batch_num = 5
    logger.info(f'Measuring memory for {network}_{batch_size} for {batch_num} iter')
    ex = "salus" if FLAGS.use_salus else "tf"
    # Final destination: <logdir>/<executor>/<canonical workload name>
    final_dst = logdir / ex / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    # atomic_directory: work happens in a temp dir that is moved into place on success
    with atomic_directory(final_dst) as outputdir:
        if not FLAGS.use_salus:
            logger.info(' Running on TF')
            wl = WTL.create(network, batch_size, batch_num, Executor.TF)
            # verbose logging so the TF allocator prints every alloc/dealloc
            wl.env['TF_CPP_MIN_VLOG_LEVEL'] = '1'
            wl.env['TF_CPP_MIN_LOG_LEVEL'] = ''
            run_tf(outputdir, wl)
            # filter and move file to a more convinent name
            # NOTE(review): only the first file yielded by iterdir() is processed
            # (the loop breaks after one pass) — presumably the single log file
            # run_tf produced; confirm the output dir holds exactly one file.
            for f in pathlib.Path(outputdir).iterdir():
                with f.with_name('alloc.output').open('w') as file:
                    # keep only allocator "+"/"-" lines
                    grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                    grep.wait()
                f.unlink()
                break
        else:
            scfg = maybe_forced_preset(presets.AllocProf)
            scfg.logconf = "memop"
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            # run the workload while the Salus server is alive
            with server.run():
                logger.info(' Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')
    return final_dst