コード例 #1
0
ファイル: card304.py プロジェクト: vycezhong/Salus-1
def tfmps2(argv):
    # type: (Sequence[str]) -> None
    """Run an inference job on TF together with a background inception4 training job.

    For each batch size, a foreground inference workload and a background
    training workload are created, each gated by its own FIFO placed in a
    fresh temporary directory, and then driven through `run_tf`.

    Args:
        argv: ``[name, batch_size, ...]``. The first element, when present, is
            the inference workload name (default ``"alexneteval"``); the
            remaining elements are batch sizes (default ``[1, 2, 4, 8]``).

    Raises:
        ValueError: if any element after the first is not an integer.
    """
    name = "alexneteval"
    # A single argument is the workload name; the previous `len(argv) > 1`
    # check silently dropped a name given without any batch size.
    if argv:
        name = argv[0]
    batch_sizes = [int(v) for v in argv[1:]]

    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job, the batch number has no effect here,
            # only used to distinguish different runs
            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
            # make sure it runs long enough
            trainWl.env['SALUS_ITER_SECONDS'] = '300'
            trainWl.extra_args += ['--min_mem']

            # create a pipe to signal trainWl
            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
            os.mkfifo(pipetrain)
            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain

            # create the foreground inference job
            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
            set_env(wl)
            wl.env['SALUS_ITER_SECONDS'] = '150'
            wl.extra_args += ['--min_mem']

            pipe = str(pathlib.Path(td).joinpath('fifo'))
            os.mkfifo(pipe)
            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe

            # The lambdas close over `pipetrain`/`pipe`; that is safe because
            # run_tf is invoked inside the same loop iteration.
            run_tf(FLAGS.save_dir / "tfmps2" / (name + "-inception4"),
                   wl,  # start the foreground job
                   Pause(20),
                   trainWl,  # start the background job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                   # start train job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                   # wait 10 seconds
                   Pause(10),
                   # release inference job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                   # run_seq automatically join all jobs at the end of the sequence
                   )
コード例 #2
0
def do_mem(logdir, network, batch_size):
    """Run `network` for a few iterations and collect memory allocation logs.

    Depending on `FLAGS.use_salus` the workload runs either directly on TF
    (with TF verbose logging enabled and the first output file filtered into
    `alloc.output`) or on a Salus server configured with the `memop` log
    configuration.

    Args:
        logdir: base output directory; must support the `/` operator.
        network: workload name understood by `WTL`.
        batch_size: batch size passed to the workload.

    Returns:
        The final output directory of this run.
    """
    # "speech" gets fewer iterations than every other network
    batch_num = 5 if network == "speech" else 20

    logger.info(f'Measuring memory for {network}_{batch_size} for {batch_num} iter')

    executor_dir = logdir / ("salus" if FLAGS.use_salus else "tf")
    run_name = WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    final_dst = executor_dir / run_name

    with atomic_directory(final_dst) as outputdir:
        if FLAGS.use_salus:
            scfg = maybe_forced_preset(presets.AllocProf)
            scfg.logconf = "memop"
            scfg.output_dir = outputdir
            server = SalusServer(scfg)
            with server.run():
                logger.info('    Running on Salus')
                WTL.block_run(network, batch_size, batch_num, Executor.Salus, outputdir / 'rpc.output')
        else:
            logger.info('    Running on TF')
            wl = WTL.create(network, batch_size, batch_num, Executor.TF)
            wl.env['TF_CPP_MIN_VLOG_LEVEL'] = '1'
            wl.env['TF_CPP_MIN_LOG_LEVEL'] = ''
            run_tf(outputdir, wl)

            # Only the first entry of the output directory is processed:
            # keep the lines matching the pattern in `alloc.output`, then
            # remove the raw file.
            raw_log = next(pathlib.Path(outputdir).iterdir(), None)
            if raw_log is not None:
                with raw_log.with_name('alloc.output').open('w') as sink:
                    grep = execute(['egrep', r"] (\+|-)", raw_log.name], stdout=sink, cwd=str(raw_log.parent))
                    grep.wait()
                raw_log.unlink()

    return final_dst
コード例 #3
0
ファイル: bs_lat_tput.py プロジェクト: vycezhong/Salus-1
def set_env(wl):
    """Populate `wl.env` with the TF benchmark evaluation settings.

    Sets the eval interval and random factor to ``'0'``, enables blocking
    evaluation, and points the workload at the legacy checkpoint model
    directory.

    Args:
        wl: a workload object exposing a mutable mapping attribute `env`.
    """
    wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0'
    wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0'
    wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true'

    model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
    model_dir = model_dir.expanduser().resolve()
    # Environment values must be plain strings; the other users of this
    # variable store str(model_dir) as well.
    wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
コード例 #4
0
ファイル: smtracing.py プロジェクト: vycezhong/Salus-1
def create_train(executor, idx, td):
    """Create the background inception4 training workload and its signal FIFO.

    Args:
        executor: executor to run the workload on.
        idx: run index; added to the batch number only to tell runs apart.
        td: directory in which the FIFO ``fifotrain`` is created.

    Returns:
        A ``(workload, fifo_path)`` tuple, with the FIFO path as a string.
    """
    # batch number (100 + idx) has no effect here, it only distinguishes runs
    workload = WTL.create('inception4', 50, 100 + idx, executor=executor)
    # long enough so the job outlives the experiment
    workload.env['SALUS_ITER_SECONDS'] = '300'

    # FIFO used to signal the training workload
    fifo_path = str(pathlib.Path(td) / 'fifotrain')
    os.mkfifo(fifo_path)
    workload.env['SALUS_WAIT_FOR_SIGNAL'] = fifo_path

    return workload, fifo_path
コード例 #5
0
ファイル: tune_pending.py プロジェクト: vycezhong/Salus-1
def create_train(executor, idx, td=None):
    """Create the background inception4 training workload.

    Args:
        executor: executor to run the workload on.
        idx: run index; added to the batch number only to tell runs apart.
        td: optional directory. When given, a uniquely named FIFO is created
            there and exported to the workload via ``SALUS_WAIT_FOR_SIGNAL``.

    Returns:
        The workload alone when `td` is None, otherwise a
        ``(workload, fifo_path)`` tuple.
    """
    # batch number (100 + idx) has no effect here, it only distinguishes runs
    workload = WTL.create('inception4', 50, 100 + idx, executor=executor)
    # long enough so the job outlives the experiment
    workload.env['SALUS_ITER_SECONDS'] = '300'

    if td is None:
        return workload

    # FIFO used to signal the training workload
    fifo_name = f'{workload.canonical_name}-{random_id()}-fifo'
    fifo_path = str(pathlib.Path(td).joinpath(fifo_name))
    os.mkfifo(fifo_path)
    workload.env['SALUS_WAIT_FOR_SIGNAL'] = fifo_path
    return workload, fifo_path
コード例 #6
0
ファイル: card308.py プロジェクト: vycezhong/Salus-1
def main(argv):
    # type: (Sequence[str]) -> None
    """Run every selected TF benchmark eval workload once with model dirs exported.

    Workloads whose runner is not `TFBenchmarkRunner`, or whose name does not
    end with ``eval``, are skipped. Each remaining workload gets the legacy
    checkpoint directory and the saved-model directory in its environment and
    is run through `run_tf` into `FLAGS.save_dir`.
    """
    models_root = '~/../symbiotic/peifeng/tf_cnn_benchmarks_models'
    model_dir = pathlib.Path(models_root + '/legacy_checkpoint_models').expanduser().resolve()
    saved_model_dir = pathlib.Path(models_root + '/saved_models').expanduser().resolve()

    for wl in select_workloads(argv, batch_size=1, batch_num=1, executor=Executor.TF):
        # only TF benchmark inference workloads are of interest
        if wl.wtl.runnerCls is not TFBenchmarkRunner or not wl.name.endswith('eval'):
            logger.info(f'Skipping {wl.name}')
            continue

        logger.info(f"**** Saving SavedModel: {wl.canonical_name}")
        logger.info(f"**** Location: {FLAGS.save_dir}")

        wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir)
        wl.env['SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR'] = str(saved_model_dir)
        run_tf(FLAGS.save_dir, wl)
コード例 #7
0
ファイル: smtracing.py プロジェクト: vycezhong/Salus-1
def create_infer(executor, name, bs, batch_num, td):
    """Create a foreground inference workload and its signal FIFO.

    Args:
        executor: executor to run the workload on.
        name: workload name.
        bs: batch size.
        batch_num: number of batches.
        td: directory in which the FIFO ``fifo`` is created.

    Returns:
        A ``(workload, fifo_path)`` tuple, with the FIFO path as a string.
    """
    workload = WTL.create(name, bs, batch_num, executor=executor)
    set_env(workload)
    workload.env['SALUS_ITER_SECONDS'] = '150'
    # '--eval_interval_random_factor=5' is intentionally not passed
    workload.extra_args += ['--eval_interval_secs=0.02']

    # FIFO used to signal the inference workload
    fifo_path = str(pathlib.Path(td) / 'fifo')
    os.mkfifo(fifo_path)
    workload.env['SALUS_WAIT_FOR_SIGNAL'] = fifo_path

    return workload, fifo_path
コード例 #8
0
ファイル: exp15.py プロジェクト: vycezhong/Salus-1
def load_trace(path, ex):
    """Load a CSV job trace and turn every row into a workload.

    The CSV must provide the columns ``model_name`` (``<name>_<batch size>``),
    ``iterations`` and ``submit_time``. When `FLAGS.scale_down` is greater
    than 1, the iteration count is floor-divided and the submit time divided
    by that factor.

    Args:
        path: path of the CSV trace file.
        ex: executor passed on to `WTL.create`.

    Returns:
        A list of ``(workload, submit_time, row)`` tuples in file order.
    """
    def row_to_job(row):
        model, batch_size = row['model_name'].split('_')
        batch_size = try_with_default(int, batch_size, ValueError)(batch_size)
        iterations = int(row['iterations'])
        submit_time = int(row['submit_time'])
        scale = FLAGS.scale_down
        if scale > 1:
            iterations //= scale
            submit_time = submit_time / scale
        return WTL.create(model, batch_size, iterations, ex), submit_time, row

    with pathlib.Path(path).open() as trace_file:
        # rows are consumed while the file is still open
        return [row_to_job(row) for row in csv.DictReader(trace_file)]
コード例 #9
0
ファイル: tune_pending.py プロジェクト: vycezhong/Salus-1
def create_infer(executor, bs, td=None):
    """Create the vgg11eval inference workload (300 batches).

    Args:
        executor: executor to run the workload on.
        bs: batch size.
        td: optional directory. When given, a uniquely named FIFO is created
            there and exported to the workload via ``SALUS_WAIT_FOR_SIGNAL``.

    Returns:
        The workload alone when `td` is None, otherwise a
        ``(workload, fifo_path)`` tuple.
    """
    workload = WTL.create('vgg11eval', bs, 300, executor=executor)
    set_env(workload)
    workload.env['SALUS_ITER_SECONDS'] = '150'
    # no extra arguments for now; '--eval_interval_secs=0.02' and
    # '--eval_interval_random_factor=5' are the disabled candidates
    workload.extra_args += []

    if td is None:
        return workload

    fifo_name = f'{workload.canonical_name}-{random_id()}-fifo'
    fifo_path = str(pathlib.Path(td).joinpath(fifo_name))
    os.mkfifo(fifo_path)
    workload.env['SALUS_WAIT_FOR_SIGNAL'] = fifo_path
    return workload, fifo_path
コード例 #10
0
def load_trace(path, ex):
    """Load a CSV job trace and turn every row into a configured workload.

    The CSV must provide the columns ``model_name`` (``<name>_<batch size>``),
    ``iterations``, ``submit_time`` and ``duration``. When `FLAGS.scale_down`
    is greater than 1, the iteration count is floor-divided and the submit
    time divided by that factor. The duration is exported unscaled.

    Args:
        path: path of the CSV trace file.
        ex: executor passed on to `WTL.create`.

    Returns:
        A list of ``(workload, submit_time, row)`` tuples in file order.
    """
    jobs = []
    with pathlib.Path(path).open() as trace_file:
        for row in csv.DictReader(trace_file):
            model, batch_size = row['model_name'].split('_')
            batch_size = try_with_default(int, batch_size, ValueError)(batch_size)
            iterations = int(row['iterations'])
            submit_time = int(row['submit_time'])
            if FLAGS.scale_down > 1:
                iterations = iterations // FLAGS.scale_down
                submit_time = submit_time / FLAGS.scale_down

            workload = WTL.create(model, batch_size, iterations, ex)
            workload.env['SALUS_TOTAL_TIME'] = row['duration']  # seconds
            # LOG(INFO) output from TF code is needed
            workload.env['TF_CPP_MIN_LOG_LEVEL'] = ''
            jobs.append((workload, submit_time, row))
    return jobs
コード例 #11
0
def do_mem(logdir, network, batch_size):
    """Run `network` on TF for 10 iterations with ``SALUS_SAVE_MODEL`` set.

    Args:
        logdir: base output directory; must support the `/` operator.
        network: workload name understood by `WTL`.
        batch_size: batch size passed to the workload.

    Returns:
        The final output directory of this run.
    """
    batch_num = 10

    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')

    run_name = WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    final_dst = logdir / run_name

    with atomic_directory(final_dst) as outputdir:
        logger.info('    Running on TF')
        workload = WTL.create(network, batch_size, batch_num, Executor.TF)
        workload.env['SALUS_SAVE_MODEL'] = '1'

        checkpoints = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models')
        workload.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(checkpoints.expanduser().resolve())

        run_tf(outputdir, workload)

    return final_dst
コード例 #12
0
def diff(argv):
    # type: (Sequence[str]) -> None
    """Run several inference workloads together on Salus, started simultaneously.

    For every batch size, one workload per name is created and gated by its
    own FIFO. The sequence handed to `run_seq` starts all workloads, waits on
    every FIFO, then releases all of them.

    Args:
        argv: mixed list of arguments. Every element that parses as an integer
            is a batch size; all others are workload names, which must end
            with ``eval``.

    Raises:
        ValueError: if a workload name does not end with ``eval``.
    """
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.logconf = 'disable'

    # all non-integer argv are treated as names
    names = []
    batch_sizes = []
    for arg in argv:
        try:
            size = int(arg)
        except ValueError:
            names.append(arg)
        else:
            batch_sizes.append(size)

    # create jobs
    batch_num = 100
    for bs in batch_sizes:
        with tempfile.TemporaryDirectory() as td:
            wls = []
            pipes = []
            for name in names:
                if not name.endswith('eval'):
                    raise ValueError('Not an inference workload!!!')
                wl = WTL.create(name, bs, batch_num, executor=Executor.Salus)
                set_env(wl)

                # Gate the job on its own fifo so that all jobs can be
                # released together once every one of them is ready.
                pipe = str(pathlib.Path(td).joinpath(wl.canonical_name).with_suffix('.pipe'))
                os.mkfifo(pipe)
                # The workload has to be told about the fifo, otherwise the
                # wait step below can never be satisfied.
                wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
                # Record the path itself (previously the list was appended to
                # itself, so no fifo was ever waited on or released).
                pipes.append(pipe)
                wls.append(wl)

            # wait all jobs to be ready
            wls.append(RunFn(lambda workloads, **kwargs: [wait_on_pipe(pipe) for pipe in pipes] and None))
            # signal all jobs to start
            wls.append(RunFn(lambda workloads, **kwargs: [release_on_pipe(pipe) for pipe in pipes] and None))

            run_seq(scfg.copy(output_dir=FLAGS.save_dir / '-'.join(names)),
                    *wls)
コード例 #13
0
def do_mem(logdir, network, batch_size):
    """Run `network` on TF for 10 iterations and filter its log into `alloc.output`.

    The workload runs with ``SALUS_SAVE_MODEL`` set. Afterwards the first
    entry of the output directory is filtered with `egrep` into
    ``alloc.output`` and then removed.

    Args:
        logdir: base output directory; must support the `/` operator.
        network: workload name understood by `WTL`.
        batch_size: batch size passed to the workload.

    Returns:
        The final output directory of this run.
    """
    batch_num = 10

    logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter')

    tf_dir = logdir / 'tf'
    run_name = WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None))
    final_dst = tf_dir / run_name

    with atomic_directory(final_dst) as outputdir:
        logger.info('    Running on TF')
        workload = WTL.create(network, batch_size, batch_num, Executor.TF)
        workload.env['SALUS_SAVE_MODEL'] = '1'
        run_tf(outputdir, workload)

        # Only the first entry of the output directory is processed: keep
        # the lines matching the pattern in `alloc.output`, then remove the
        # raw file.
        raw_log = next(pathlib.Path(outputdir).iterdir(), None)
        if raw_log is not None:
            with raw_log.with_name('alloc.output').open('w') as sink:
                grep = execute(['egrep', r"] (\+|-)", raw_log.name], stdout=sink, cwd=str(raw_log.parent))
                grep.wait()
            raw_log.unlink()

    return final_dst