def twoinfer(argv):
    # type: (Sequence[str]) -> None
    base_cfg = maybe_forced_preset(presets.MostEfficient)

    sm_factors = [float(v) for v in argv]
    if not sm_factors:
        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]

    for idx, factor in enumerate(sm_factors):
        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "twoinfer" / "salus" / f"{factor:.2f}")
        scfg.extra_args += [
            '--sm-factor', f'{factor:.2f}'
        ]

        with tempfile.TemporaryDirectory() as td:
            # create the first inference job
            wl1, pipe1 = create_infer(Executor.Salus, 10, td)

            # create the second inference job
            wl2, pipe2 = create_infer(Executor.Salus, 10, td)

            run_seq(scfg,
                    wl1,  # start the first job
                    wl2,  # start the second job
                    # wait for both jobs to be ready
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)),
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)),
                    # release the first job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)),
                    # release the second job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)),
                    # run_seq automatically joins all jobs at the end of the sequence
                    )
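
# Note: wait_on_pipe()/release_on_pipe() are helpers defined elsewhere in this
# repository. The sketch below is only an illustration of the named-FIFO
# handshake they are assumed to implement (see tfmps2() below, which wires the
# FIFO into the workload via SALUS_WAIT_FOR_SIGNAL); the real signatures and
# semantics may differ.
def _sketch_wait_on_pipe(pipe):
    # type: (str) -> None
    """Block until the workload on the other end opens the FIFO and reports ready (sketch)."""
    with open(pipe, 'rb') as f:
        f.read(1)


def _sketch_release_on_pipe(pipe):
    # type: (str) -> None
    """Write a single byte into the FIFO to let the waiting workload proceed (sketch)."""
    with open(pipe, 'wb') as f:
        f.write(b'\x01')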
def tfmps(argv):
    # type: (Sequence[str]) -> None
    batch_sizes = [int(v) for v in argv[1:]]
    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job
            train_wl, pipetrain = create_train(Executor.TF, idx, td)
            train_wl.extra_args += ['--min_mem']

            # create the foreground inference job
            wl, pipe = create_infer(Executor.TF, bs, td)
            wl.extra_args += ['--min_mem']

            run_tf(FLAGS.save_dir / "tfmps",
                   train_wl,  # start the background job
                   wl,  # start the foreground job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                   # release the training job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                   # wait 10 seconds
                   Pause(10),
                   # release the inference job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                   # run_tf automatically joins all jobs at the end of the sequence
                   )
def twoinfer_tfmps(argv):
    # type: (Sequence[str]) -> None
    batch_sizes = [int(v) for v in argv]
    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create the first inference job
            wl1, pipe1 = create_infer(Executor.TF, bs, td)
            wl1.extra_args += ['--min_mem']

            # create the second inference job
            wl2, pipe2 = create_infer(Executor.TF, bs, td)
            wl2.extra_args += ['--min_mem']

            run_tf(FLAGS.save_dir / "twoinfer" / "tfmps",
                   wl1,  # start the first job
                   wl2,  # start the second job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)),
                   # release the first job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)),
                   # release the second job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)),
                   # run_tf automatically joins all jobs at the end of the sequence
                   )
def tfmps2(argv):
    # type: (Sequence[str]) -> None
    name = "alexneteval"
    if len(argv) > 1:
        name = argv[0]

    batch_sizes = [int(v) for v in argv[1:]]
    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job; the batch number has no effect here,
            # it is only used to distinguish different runs
            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
            # make sure it runs long enough
            trainWl.env['SALUS_ITER_SECONDS'] = '300'
            trainWl.extra_args += ['--min_mem']

            # create a pipe to signal trainWl
            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
            os.mkfifo(pipetrain)
            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain

            # create the foreground inference job
            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
            set_env(wl)
            wl.env['SALUS_ITER_SECONDS'] = '150'
            wl.extra_args += ['--min_mem']

            # create a pipe to signal wl
            pipe = str(pathlib.Path(td).joinpath('fifo'))
            os.mkfifo(pipe)
            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe

            run_tf(FLAGS.save_dir / "tfmps2" / (name + "-inception4"),
                   wl,  # start the foreground job
                   Pause(20),
                   trainWl,  # start the background job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                   # release the training job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                   # wait 10 seconds
                   Pause(10),
                   # release the inference job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                   # run_tf automatically joins all jobs at the end of the sequence
                   )
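
# create_train()/create_infer() used by the other experiments are defined
# elsewhere in this repository; based on the inline pattern spelled out in
# tfmps2() above, they are assumed to build a workload via WTL.create, pin its
# iteration time, and attach a named FIFO through SALUS_WAIT_FOR_SIGNAL.
# A minimal sketch follows (the parameter list is an assumption; the actual
# call sites in this file use several variants):
def _sketch_create_infer(executor, bs, td, name='alexneteval', batch_num=300):
    # type: (Executor, int, str, str, int) -> Tuple[Workload, str]
    wl = WTL.create(name, bs, batch_num, executor=executor)
    set_env(wl)
    wl.env['SALUS_ITER_SECONDS'] = '150'
    # gate the workload on a per-run FIFO inside the temporary directory
    pipe = str(pathlib.Path(td).joinpath('fifo'))
    os.mkfifo(pipe)
    wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
    return wl, pipe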
def diff(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.logconf = 'disable'

    # all non-integer argv are treated as names
    names = []
    batch_sizes = []
    for arg in argv:
        try:
            batch_sizes.append(int(arg))
        except ValueError:
            names.append(arg)

    # create jobs
    batch_num = 100
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for bs in batch_sizes:
        with tempfile.TemporaryDirectory() as td:
            wls = []
            pipes = []
            for name in names:
                if not name.endswith('eval'):
                    raise ValueError('Not an inference workload!!!')
                wl = WTL.create(name, bs, batch_num, executor=Executor.Salus)
                set_env(wl)
                wls.append(wl)

                # create a per-workload pipe to gate its start
                pipe = str(pathlib.Path(td).joinpath(wl.canonical_name).with_suffix('.pipe'))
                os.mkfifo(pipe)
                wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
                pipes.append(pipe)

            # wait for all jobs to be ready
            wls.append(RunFn(lambda workloads, **kwargs: [wait_on_pipe(pipe) for pipe in pipes] and None))
            # signal all jobs to start
            wls.append(RunFn(lambda workloads, **kwargs: [release_on_pipe(pipe) for pipe in pipes] and None))

            run_seq(scfg.copy(output_dir=FLAGS.save_dir / '-'.join(names)),
                    *wls)
def same_pri_salus(argv):
    # type: (Sequence[str]) -> None
    """Inverted priority for training and inference"""
    base_cfg = maybe_forced_preset(presets.MostEfficient)

    sm_factors = [float(v) for v in argv]
    if not sm_factors:
        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]

    for idx, factor in enumerate(sm_factors):
        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "same_pri" / f"{factor:.2f}")
        scfg.extra_args += [
            '--sm-factor', f'{factor:.2f}'
        ]

        with tempfile.TemporaryDirectory() as td:
            # create a background training job
            train_wl, pipetrain = create_train(Executor.Salus, 0, td)

            # create the foreground inference job
            wl, pipe = create_infer(Executor.Salus, 10, td)
            wl.extra_args += [
                '--eval_sched_priority', '20'
            ]

            run_seq(scfg,
                    train_wl,  # start the background job
                    wl,  # start the foreground job
                    # wait for both jobs to be ready
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                    # release the training job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                    # wait 10 seconds
                    Pause(10),
                    # release the inference job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                    # run_seq automatically joins all jobs at the end of the sequence
                    )
def train_alone(argv):
    # type: (Sequence[str]) -> None
    """Run a training workload alone and take note of SM usage"""
    sm_factors = [float(v) for v in argv]
    if not sm_factors:
        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]

    logger.info(f"Running Salus with sm factors: {sm_factors}")

    # run salus
    for factor in sm_factors:
        with tempfile.TemporaryDirectory() as td:
            scfg = maybe_forced_preset(presets.OpTracing)
            scfg.logconf = 'smtracing'
            scfg.extra_args += [
                '--sm-factor', f'{factor:.2f}'
            ]

            logger.info(f"Running Salus with sm factor: {factor}")

            # the background training job
            wl, pipe = create_train(Executor.Salus, 0, td)

            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"),
                    wl,
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)))
def salus(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)

    name = "alexneteval"
    if len(argv) > 1:
        name = argv[0]

    batch_sizes = [int(v) for v in argv[1:]]
    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job
            train_wl, pipetrain = create_train(Executor.Salus, idx, td)

            # create the foreground inference job
            wl, pipe = create_infer(Executor.Salus, name, bs, batch_num, td)

            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus" / (name + "-inception4")),
                    train_wl,  # start the background job
                    wl,  # start the foreground job
                    # wait for both jobs to be ready
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                    # release the training job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                    # wait 10 seconds
                    Pause(10),
                    # release the inference job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                    # run_seq automatically joins all jobs at the end of the sequence
                    )
def main(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'

    cases = (Cases[c] for c in argv) if argv else Cases

    templates = list(gen_workload_list(FLAGS.select_wl))
    if FLAGS.total_num > 0:
        templates = templates[:FLAGS.total_num]

    logger.info("Selected the following list of workloads")
    for wtl, rcfg in templates:
        logger.info(f"  {wtl.canonical_name(rcfg)} of {rcfg.batch_num} iters")

    # check that workloads have the info we need
    for wtl, rcfg in templates:
        for field in ['jct', 'persistmem']:
            if wtl.geometry(rcfg, Executor.Salus)[field] is None:
                raise ValueError(f'Missing {field} data for workload {wtl.canonical_name(rcfg)} of {rcfg.batch_num} iters,'
                                 f' available geometries: {wtl._geometries}')

    for case in cases:
        logdir = FLAGS.save_dir / case.name

        # create workload instances
        workloads = (wtl._create_from_rcfg(rcfg, Executor.Salus) for wtl, rcfg in templates)

        # sort workloads according to the case
        key, desc = case.value
        workloads = sorted(workloads, key=lambda w: w.geometry[key], reverse=desc)

        def limit_concurrent(wls):
            # type: (Iterable[Workload]) -> None
            """Block until fewer than FLAGS.concurrent_jobs workloads are still running"""
            gone, alive = SalusServer.wait_workloads(wls, timeout=0)
            while len(alive) >= FLAGS.concurrent_jobs:
                gone, alive = SalusServer.wait_workloads(wls, timeout=0)
                time.sleep(.25)

        # after starting each workload, wait until there is room for the next one
        actions = chain(*(
            [w, RunFn(limit_concurrent)]
            for w in workloads
        ))

        run_seq(scfg.copy(output_dir=logdir),
                *actions)