def tfmps2(argv):
    # type: (Sequence[str]) -> None
    name = "alexneteval"
    # the first argument, if any, overrides the workload name
    if argv:
        name = argv[0]

    batch_sizes = [int(v) for v in argv[1:]]
    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job; the batch number has no effect here,
            # it is only used to distinguish different runs
            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
            # make sure it runs long enough
            trainWl.env['SALUS_ITER_SECONDS'] = '300'
            trainWl.extra_args += ['--min_mem']

            # create a pipe to signal trainWl
            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
            os.mkfifo(pipetrain)
            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain

            # create the foreground inference job
            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
            set_env(wl)
            wl.env['SALUS_ITER_SECONDS'] = '150'
            wl.extra_args += ['--min_mem']

            pipe = str(pathlib.Path(td).joinpath('fifo'))
            os.mkfifo(pipe)
            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe

            run_tf(FLAGS.save_dir / "tfmps2" / (name + "-inception4"),
                   wl,  # start the foreground job
                   Pause(20),
                   trainWl,  # start the background job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                   # start the train job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                   # wait 10 seconds
                   Pause(10),
                   # release the inference job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                   # run_seq automatically joins all jobs at the end of the sequence
                   )

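# wait_on_pipe and release_on_pipe are used throughout this section but not
# defined in it. A minimal sketch of the handshake they appear to implement,
# assuming the workload writes one byte to its FIFO when ready and then blocks
# reading until the driver answers (the real helpers may differ):
def wait_on_pipe(pipe):
    # block until the workload signals readiness
    with open(pipe, 'rb') as f:
        f.read(1)


def release_on_pipe(pipe):
    # unblock a workload waiting on its FIFO
    with open(pipe, 'wb') as f:
        f.write(b'\x01')
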
def do_mem(logdir, network, batch_size):
    """Measure memory usage for one workload."""
    batch_num = 20
    if network == "speech":
        batch_num = 5
    logger.info(f'Measuring memory for {network}_{batch_size} for {batch_num} iter')

    ex = "salus" if FLAGS.use_salus else "tf"
    final_dst = logdir / ex / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        if not FLAGS.use_salus:
            logger.info(' Running on TF')
            wl = WTL.create(network, batch_size, batch_num, Executor.TF)
            wl.env['TF_CPP_MIN_VLOG_LEVEL'] = '1'
            wl.env['TF_CPP_MIN_LOG_LEVEL'] = ''
            run_tf(outputdir, wl)
            # filter and move the log file to a more convenient name
            for f in pathlib.Path(outputdir).iterdir():
                with f.with_name('alloc.output').open('w') as file:
                    # keep only allocation (+) and deallocation (-) lines
                    grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                    grep.wait()
                f.unlink()
                break
        else:
            scfg = maybe_forced_preset(presets.AllocProf)
            scfg.logconf = "memop"
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            with server.run():
                logger.info(' Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')

    return final_dst

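# atomic_directory is not shown in this section. A sketch of the contract the
# code above relies on, assuming it stages output in a scratch directory and
# publishes it to final_dst only when the block exits without raising:
from contextlib import contextmanager


@contextmanager
def atomic_directory(final_dst):
    final_dst = pathlib.Path(final_dst)
    staging = final_dst.with_name(final_dst.name + '.staging')
    staging.mkdir(parents=True, exist_ok=True)
    yield staging
    # publish only on success, so a failed run never leaves a partial final_dst
    staging.rename(final_dst)
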
def set_env(wl):
    wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0'
    wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0'
    wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true'

    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
    model_dir = model_dir.expanduser().resolve()
    # environment values must be strings, so stringify the path
    wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)

def create_train(executor, idx, td):
    # the batch number has no effect here; it is only used to distinguish different runs
    train_wl = WTL.create('inception4', 50, 100 + idx, executor=executor)
    # make sure it runs long enough
    train_wl.env['SALUS_ITER_SECONDS'] = '300'

    # create a pipe to signal train_wl
    pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
    os.mkfifo(pipetrain)
    train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
    return train_wl, pipetrain

def create_train(executor, idx, td=None):
    # the batch number has no effect here; it is only used to distinguish different runs
    train_wl = WTL.create('inception4', 50, 100 + idx, executor=executor)
    # make sure it runs long enough
    train_wl.env['SALUS_ITER_SECONDS'] = '300'

    if td is not None:
        # create a pipe to signal train_wl
        pipetrain = str(pathlib.Path(td) / f'{train_wl.canonical_name}-{random_id()}-fifo')
        os.mkfifo(pipetrain)
        train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain
        return train_wl, pipetrain

    return train_wl

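# random_id is referenced above but not defined here; a hypothetical stand-in
# that produces a short random suffix so FIFOs created in the same temporary
# directory do not collide:
import random
import string


def random_id(length=8):
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choices(alphabet, k=length))
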
def main(argv):
    # type: (Sequence[str]) -> None
    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
    model_dir = model_dir.expanduser().resolve()

    saved_model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/saved_models')
    saved_model_dir = saved_model_dir.expanduser().resolve()

    for wl in select_workloads(argv, batch_size=1, batch_num=1, executor=Executor.TF):
        if wl.wtl.runnerCls is not TFBenchmarkRunner:
            logger.info(f'Skipping {wl.name}')
            continue
        if not wl.name.endswith('eval'):
            logger.info(f'Skipping {wl.name}')
            continue
        logger.info(f"**** Saving SavedModel: {wl.canonical_name}")
        logger.info(f"**** Location: {FLAGS.save_dir}")

        wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
        wl.env['SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR'] = str(saved_model_dir)
        run_tf(FLAGS.save_dir, wl)

def create_infer(executor, name, bs, batch_num, td):
    wl = WTL.create(name, bs, batch_num, executor=executor)
    set_env(wl)
    wl.env['SALUS_ITER_SECONDS'] = '150'
    wl.extra_args += [
        '--eval_interval_secs=0.02',
        # '--eval_interval_random_factor=5'
    ]

    pipe = str(pathlib.Path(td).joinpath('fifo'))
    os.mkfifo(pipe)
    wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
    return wl, pipe

def load_trace(path, ex):
    path = pathlib.Path(path)
    with path.open() as f:
        reader = csv.DictReader(f)

        def create_from_row(row):
            name, bs = row['model_name'].split('_')
            bs = try_with_default(int, bs, ValueError)(bs)
            bn = int(row['iterations'])
            submit_time = int(row['submit_time'])
            if FLAGS.scale_down > 1:
                bn = bn // FLAGS.scale_down
                submit_time = submit_time / FLAGS.scale_down
            return WTL.create(name, bs, bn, ex), submit_time, row

        return [create_from_row(row) for row in reader]

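# try_with_default is not defined in this section; a sketch matching its call
# shape above, try_with_default(int, bs, ValueError)(bs): apply the function,
# falling back to the given default when the named exception is raised (here,
# models whose name carries no numeric batch-size suffix):
def try_with_default(fn, default, exc):
    def wrapped(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except exc:
            return default
    return wrapped
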
def create_infer(executor, bs, td=None):
    wl = WTL.create('vgg11eval', bs, 300, executor=executor)
    set_env(wl)
    wl.env['SALUS_ITER_SECONDS'] = '150'
    wl.extra_args += [
        # '--eval_interval_secs=0.02',
        # '--eval_interval_random_factor=5'
    ]

    if td is not None:
        pipe = str(pathlib.Path(td) / f'{wl.canonical_name}-{random_id()}-fifo')
        os.mkfifo(pipe)
        wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
        return wl, pipe

    return wl

def load_trace(path, ex):
    path = pathlib.Path(path)
    with path.open() as f:
        reader = csv.DictReader(f)

        def create_from_row(row):
            name, bs = row['model_name'].split('_')
            bs = try_with_default(int, bs, ValueError)(bs)
            bn = int(row['iterations'])
            submit_time = int(row['submit_time'])
            if FLAGS.scale_down > 1:
                bn = bn // FLAGS.scale_down
                submit_time = submit_time / FLAGS.scale_down

            w = WTL.create(name, bs, bn, ex)
            w.env['SALUS_TOTAL_TIME'] = row['duration']  # seconds
            w.env['TF_CPP_MIN_LOG_LEVEL'] = ''  # we need LOG(INFO) from TF code
            return w, submit_time, row

        return [create_from_row(row) for row in reader]

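# The row accesses above imply the trace CSV provides at least these columns;
# this sample row is purely illustrative (the values are hypothetical):
#
#   model_name,iterations,submit_time,duration
#   vgg11_25,2089,206,3354
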
def do_mem(logdir, network, batch_size):
    """Save a model checkpoint for the workload."""
    batch_num = 10
    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')

    final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        logger.info(' Running on TF')
        wl = WTL.create(network, batch_size, batch_num, Executor.TF)
        wl.env['SALUS_SAVE_MODEL'] = '1'

        model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
        model_dir = model_dir.expanduser().resolve()
        wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)

        run_tf(outputdir, wl)
    return final_dst

def diff(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.logconf = 'disable'

    # all non-integer argv are treated as names
    names = []
    batch_sizes = []
    for arg in argv:
        try:
            batch_sizes.append(int(arg))
        except ValueError:
            names.append(arg)

    # create jobs
    batch_num = 100
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for bs in batch_sizes:
        with tempfile.TemporaryDirectory() as td:
            wls = []
            pipes = []
            for name in names:
                if not name.endswith('eval'):
                    raise ValueError('Not an inference workload!!!')
                wl = WTL.create(name, bs, batch_num, executor=Executor.Salus)
                set_env(wl)
                wls.append(wl)

                # use a pipe per job to make sure every job starts
                pipe = str(pathlib.Path(td).joinpath(wl.canonical_name).with_suffix('.pipe'))
                os.mkfifo(pipe)
                wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
                pipes.append(pipe)

            # wait for all jobs to be ready
            wls.append(RunFn(lambda workloads, **kwargs: [wait_on_pipe(pipe) for pipe in pipes] and None))
            # signal all jobs to start
            wls.append(RunFn(lambda workloads, **kwargs: [release_on_pipe(pipe) for pipe in pipes] and None))
            run_seq(scfg.copy(output_dir=FLAGS.save_dir / '-'.join(names)), *wls)

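# A hypothetical invocation, given the argv parsing above (integer arguments
# become batch sizes, everything else is a workload name):
#
#   diff(['vgg11eval', 'resnet50eval', '1', '8'])
#
# runs both inference workloads side by side, once at batch size 1 and once
# at batch size 8.
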
def do_mem(logdir, network, batch_size):
    """Save a model checkpoint for the workload."""
    batch_num = 10
    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')

    final_dst = logdir / 'tf' / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    with atomic_directory(final_dst) as outputdir:
        logger.info(' Running on TF')
        wl = WTL.create(network, batch_size, batch_num, Executor.TF)
        wl.env['SALUS_SAVE_MODEL'] = '1'
        run_tf(outputdir, wl)

        # filter and move the log file to a more convenient name
        for f in pathlib.Path(outputdir).iterdir():
            with f.with_name('alloc.output').open('w') as file:
                # keep only allocation (+) and deallocation (-) lines
                grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent))
                grep.wait()
            f.unlink()
            break
    return final_dst