예제 #1
0
def case1():
    """Run one inception3eval TFDist workload for every entry in ``rates``.

    ``rates`` is not defined in this function; it is read from an enclosing
    scope. For each rate the workload's eval interval is set to the
    reciprocal of the rate, and results go to ``<save_dir>/case1/<rate>``.
    """
    for rate in rates:
        workload = WTL.create("inception3eval", 1, 500, executor=Executor.TFDist)

        # Environment overrides for this run; the interval is 1 / rate.
        settings = (
            ('SALUS_TFBENCH_EVAL_INTERVAL', str(1 / rate)),
            ('SALUS_TFBENCH_EVAL_RAND_FACTOR', '1'),
            ('SALUS_TFBENCH_EVAL_BLOCK', 'false'),
        )
        for key, value in settings:
            workload.env[key] = value

        output_dir = FLAGS.save_dir / 'case1' / str(rate)
        run_tfdist(output_dir, workload)
예제 #2
0
파일: card13.py 프로젝트: vycezhong/Salus-1
def case1():
    """Run inception3eval (batch size 50, 50 batches) under three setups.

    First a TF-executor and a TFDist-executor workload are handed together to
    ``run_tfdist``; afterwards a workload with the default executor is handed
    to ``run_seq`` using the (possibly forced) MostEfficient preset. Both
    runs write to ``<save_dir>/case1``.
    """
    scfg = maybe_forced_preset(presets.MostEfficient)
    output_dir = FLAGS.save_dir / 'case1'

    tf_workload = WTL.create("inception3eval", 50, 50, executor=Executor.TF)
    tfdist_workload = WTL.create("inception3eval", 50, 50, executor=Executor.TFDist)
    run_tfdist(output_dir, tf_workload, tfdist_workload)

    # Copy the preset so the output directory override stays local to this run.
    seq_cfg = scfg.copy(output_dir=output_dir)
    run_seq(seq_cfg, WTL.create("inception3eval", 50, 50))
예제 #3
0
def case3(argv):
    """Run resnet50 once under TFDist, then 300 copies with the 'pack' scheduler.

    Args:
        argv: Accepted for interface compatibility; not used by this case.
    """
    model, batch_size, batch_num = 'resnet50', 50, 500
    # Results are stored under a directory named after this function.
    output_dir = FLAGS.save_dir / inspect.currentframe().f_code.co_name

    # First run a single job alone to get its JCT.
    baseline = WTL.create(model, batch_size, batch_num, executor=Executor.TFDist)
    run_tfdist(output_dir, baseline)

    # Then create 300 identical workloads and run them with the 'pack' scheduler.
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'

    workloads = []
    for _ in range(300):
        workloads.append(WTL.create(model, batch_size, batch_num))
    run_seq(scfg.copy(output_dir=output_dir), *workloads)
예제 #4
0
def tfdist(argv):
    # type: (Sequence[str]) -> None
    """Run an inference workload next to a background inception4 training job.

    For every batch size, a background training workload and a foreground
    inference workload are created, each configured to wait on its own FIFO
    (``SALUS_WAIT_FOR_SIGNAL``). The training job is released first, and the
    inference job is released after a 10 second pause.

    Args:
        argv: Optional positional arguments. ``argv[0]`` is the inference
            model name (default ``"alexneteval"``); any remaining items are
            batch sizes parsed with ``int`` (default ``[1, 2, 4, 8]``).

    Raises:
        ValueError: If a batch size argument is not a valid integer.
    """
    name = "alexneteval"
    # Accept a lone model name as well: the previous `len(argv) > 1` check
    # silently ignored the name unless at least one batch size followed it.
    if argv:
        name = argv[0]
    batch_sizes = [int(v) for v in argv[1:]]

    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    for idx, bs in enumerate(batch_sizes):
        # The FIFOs live in a temporary directory that is removed after each run.
        with tempfile.TemporaryDirectory() as td:
            # create a background training job, the batch number has no effect here,
            # only used to distinguish different runs
            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TFDist)
            # make sure it runs long enough
            trainWl.env['SALUS_ITER_SECONDS'] = '300'

            # create a pipe to signal trainWl
            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
            os.mkfifo(pipetrain)
            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain

            # create the foreground inference job
            wl = WTL.create(name, bs, batch_num, executor=Executor.TFDist)
            set_env(wl)
            wl.env['SALUS_ITER_SECONDS'] = '150'

            # create a pipe to signal the inference job
            pipe = str(pathlib.Path(td).joinpath('fifo'))
            os.mkfifo(pipe)
            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe

            run_tfdist(FLAGS.save_dir / "tfdist" / (name + "-inception4"),
                       trainWl,  # start the background job
                       wl,  # start the foreground job
                       # wait for both jobs to be ready
                       RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                       RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                       # start train job
                       RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                       # wait 10 seconds
                       Pause(10),
                       # release inference job
                       RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                       # NOTE(review): the jobs are presumably joined automatically
                       # at the end of the sequence -- confirm for run_tfdist
                       )
예제 #5
0
def case3():
    """Run a single inception3eval TFDist workload (batch size 1, 1000 batches).

    Results are written to ``<save_dir>/case3``.
    """
    output_dir = FLAGS.save_dir / 'case3'
    workload = WTL.create("inception3eval", 1, 1000, executor=Executor.TFDist)
    run_tfdist(output_dir, workload)
예제 #6
0
def test():
    """Run inception4 and inception3 TFDist workloads separated by ``Pause.Wait``.

    Each workload runs a single batch; results go directly to ``FLAGS.save_dir``.
    """
    output_dir = FLAGS.save_dir
    first = WTL.create("inception4", 25, 1, executor=Executor.TFDist)
    second = WTL.create("inception3", 50, 1, executor=Executor.TFDist)
    run_tfdist(output_dir, first, Pause.Wait, second)