Example #1
def twoinfer(argv):
    # type: (Sequence[str]) -> None
    base_cfg = maybe_forced_preset(presets.MostEfficient)

    sm_factors = [float(v) for v in argv]
    if not sm_factors:
        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]

    for idx, factor in enumerate(sm_factors):
        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "twoinfer" / "salus" / f"{factor:.2f}")
        scfg.extra_args += [
            '--sm-factor', f'{factor:.2f}'
        ]
        with tempfile.TemporaryDirectory() as td:
            # create the first inference job
            wl1, pipe1 = create_infer(Executor.Salus, 10, td)

            # create the second inference job
            wl2, pipe2 = create_infer(Executor.Salus, 10, td)

            run_seq(scfg,
                    wl1,  # start the first job
                    wl2,  # start the second job
                    # wait for both jobs to be ready
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)),
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)),
                    # release the first job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)),
                    # release the second job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)),
                    # run_seq automatically joins all jobs at the end of the sequence
                    )
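
These examples coordinate job start-up through named FIFOs, but the wait_on_pipe and release_on_pipe helpers themselves are not shown. Below is a minimal sketch of how such a handshake could work, assuming each pipe is the path of a FIFO created with os.mkfifo and that the workload writes one byte when it is ready and then blocks reading the same FIFO; the actual Salus helpers may differ.

import os

def wait_on_pipe(pipe):
    # type: (str) -> None
    # Opening a FIFO for reading blocks until the workload opens it for
    # writing; receiving one byte means the workload has signaled readiness.
    with open(pipe, 'rb') as f:
        f.read(1)

def release_on_pipe(pipe):
    # type: (str) -> None
    # Opening the FIFO for writing unblocks the workload, which by now is
    # waiting on the read side; the byte tells it to start running.
    with open(pipe, 'wb') as f:
        f.write(b'\x01')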
Example #2
def tfmps(argv):
    # type: (Sequence[str]) -> None
    batch_sizes = [int(v) for v in argv]

    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job
            train_wl, pipetrain = create_train(Executor.TF, idx, td)
            train_wl.extra_args += ['--min_mem']

            # create the foreground inference job
            wl, pipe = create_infer(Executor.TF, bs, td)
            wl.extra_args += ['--min_mem']

            run_tf(FLAGS.save_dir / "tfmps",
                   train_wl,  # start the background job
                   wl,  # start the foreground job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                   # release the training job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                   # wait 10 seconds
                   Pause(10),
                   # release the inference job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                   # run_tf automatically joins all jobs at the end of the sequence
                   )
Example #3
def twoinfer_tfmps(argv):
    # type: (Sequence[str]) -> None
    batch_sizes = [int(v) for v in argv]

    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create the first inference job
            wl1, pipe1 = create_infer(Executor.TF, bs, td)
            wl1.extra_args += ['--min_mem']
            # create the second inference job
            wl2, pipe2 = create_infer(Executor.TF, bs, td)
            wl2.extra_args += ['--min_mem']

            run_tf(FLAGS.save_dir / "twoinfer" / "tfmps",
                   wl1,  # start the first inference job
                   wl2,  # start the second inference job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)),
                   # release the first job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)),
                   # release the second job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)),
                   # run_tf automatically joins all jobs at the end of the sequence
                   )
Example #4
def tfmps2(argv):
    # type: (Sequence[str]) -> None
    name = "alexneteval"
    if argv:
        name = argv[0]
    batch_sizes = [int(v) for v in argv[1:]]

    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job; the batch number has no effect here,
            # it is only used to distinguish different runs
            trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF)
            # make sure it runs long enough
            trainWl.env['SALUS_ITER_SECONDS'] = '300'
            trainWl.extra_args += ['--min_mem']

            # create a pipe to signal trainWl
            pipetrain = str(pathlib.Path(td).joinpath('fifotrain'))
            os.mkfifo(pipetrain)
            trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain

            # create the foreground inference job
            wl = WTL.create(name, bs, batch_num, executor=Executor.TF)
            set_env(wl)
            wl.env['SALUS_ITER_SECONDS'] = '150'
            wl.extra_args += ['--min_mem']

            pipe = str(pathlib.Path(td).joinpath('fifo'))
            os.mkfifo(pipe)
            wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe

            run_tf(FLAGS.save_dir / "tfmps2" / (name + "-inception4"),
                   wl,  # start the foreground job
                   Pause(20),
                   trainWl,  # start the background job
                   # wait for both jobs to be ready
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                   RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                   # release the training job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                   # wait 10 seconds
                   Pause(10),
                   # release the inference job
                   RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                   # run_tf automatically joins all jobs at the end of the sequence
                   )
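
tfmps2 inlines the pattern that create_train and create_infer wrap in the other examples: create the workload with WTL.create, make a FIFO in the temporary directory, and point SALUS_WAIT_FOR_SIGNAL at it. Below is a hypothetical reconstruction of create_train along those lines; the real helper's signature and defaults are not shown in these examples, and WTL comes from the surrounding driver code.

import os
import pathlib

def create_train(executor, idx, td, name='inception4'):
    # Hypothetical reconstruction based on the inlined code in tfmps2;
    # batch size 50 and batch number 100 + idx mirror the values used there.
    wl = WTL.create(name, 50, 100 + idx, executor=executor)
    wl.env['SALUS_ITER_SECONDS'] = '300'  # make sure it runs long enough
    pipe = str(pathlib.Path(td).joinpath('fifotrain'))
    os.mkfifo(pipe)
    wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
    return wl, pipe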
Example #5
def diff(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.logconf = 'disable'

    # all non-integer argv are treated as names
    names = []
    batch_sizes = []
    for arg in argv:
        try:
            batch_sizes.append(int(arg))
        except ValueError:
            names.append(arg)

    # create jobs
    batch_num = 100
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for bs in batch_sizes:
        with tempfile.TemporaryDirectory() as td:
            wls = []
            pipes = []
            for name in names:
                if not name.endswith('eval'):
                    raise ValueError('Not an inference workload!!!')
                wl = WTL.create(name, bs, batch_num, executor=Executor.Salus)
                set_env(wl)
                wls.append(wl)

                # create a pipe to signal the workload
                pipe = str(pathlib.Path(td).joinpath(wl.canonical_name).with_suffix('.pipe'))
                os.mkfifo(pipe)
                wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe
                pipes.append(pipe)

            # wait for all jobs to be ready
            wls.append(RunFn(lambda workloads, **kwargs: [wait_on_pipe(pipe) for pipe in pipes] and None))
            # signal all jobs to start
            wls.append(RunFn(lambda workloads, **kwargs: [release_on_pipe(pipe) for pipe in pipes] and None))

            run_seq(scfg.copy(output_dir=FLAGS.save_dir / '-'.join(names)),
                    *wls)
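
The argument parsing in diff relies on int() raising ValueError for non-numeric strings, so workload names and batch sizes can be mixed freely on the command line. The same pattern in isolation, with hypothetical example arguments:

argv = ['resnet50eval', 'vgg16eval', '1', '8']

names, batch_sizes = [], []
for arg in argv:
    try:
        batch_sizes.append(int(arg))
    except ValueError:
        names.append(arg)

assert names == ['resnet50eval', 'vgg16eval']
assert batch_sizes == [1, 8]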
Example #6
def same_pri_salus(argv):
    # type: (Sequence[str]) -> None
    """Inversed priority for training and inference"""
    base_cfg = maybe_forced_preset(presets.MostEfficient)

    sm_factors = [float(v) for v in argv]
    if not sm_factors:
        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]

    for idx, factor in enumerate(sm_factors):
        scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "same_pri" / f"{factor:.2f}")
        scfg.extra_args += [
            '--sm-factor', f'{factor:.2f}'
        ]
        with tempfile.TemporaryDirectory() as td:
            # create a background training job
            train_wl, pipetrain = create_train(Executor.Salus, 0, td)

            # create the foreground inference job
            wl, pipe = create_infer(Executor.Salus, 10, td)
            wl.extra_args += [
                '--eval_sched_priority', '20'
            ]

            run_seq(scfg,
                    train_wl,  # start the background job
                    wl,  # start the foreground job
                    # wait for both jobs to be ready
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                    # release the training job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                    # wait 10 seconds
                    Pause(10),
                    # release the inference job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                    # run_seq automatically joins all jobs at the end of the sequence
                    )
Example #7
def train_alone(argv):
    """Run training workload alone take note of SM usage"""
    sm_factors = [float(v) for v in argv]
    if not sm_factors:
        sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0]

    logger.info(f"Running Salus with sm factors: {sm_factors}")

    # run salus
    for factor in sm_factors:
        with tempfile.TemporaryDirectory() as td:
            scfg = maybe_forced_preset(presets.OpTracing)
            scfg.logconf = 'smtracing'
            scfg.extra_args += [
                '--sm-factor', f'{factor:.2f}'
            ]
            logger.info(f"Running Salus with sm factor: {factor}")
            # the training job, run alone
            wl, pipe = create_train(Executor.Salus, 0, td)
            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"),
                    wl,
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)))
Example #8
def salus(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)

    name = "alexneteval"
    if argv:
        name = argv[0]
    batch_sizes = [int(v) for v in argv[1:]]

    if not batch_sizes:
        batch_sizes = [1, 2, 4, 8]

    batch_num = 300
    # batch_sizes = [1, 2, 4, 8, 16, 32]
    # batch_sizes = [1024, 1536, 2048, 4096]
    for idx, bs in enumerate(batch_sizes):
        with tempfile.TemporaryDirectory() as td:
            # create a background training job
            train_wl, pipetrain = create_train(Executor.Salus, idx, td)

            # create the foreground inference job
            wl, pipe = create_infer(Executor.Salus, name, bs, batch_num, td)

            run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus" / (name + "-inception4")),
                    train_wl,  # start the background job
                    wl,  # start the foreground job
                    # wait for both jobs to be ready
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)),
                    RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)),
                    # release the training job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)),
                    # wait 10 seconds
                    Pause(10),
                    # release the inference job
                    RunFn(lambda *args, **kwargs: release_on_pipe(pipe)),
                    # run_seq automatically joins all jobs at the end of the sequence
                    )
Example #9
File: bigrun.py Project: vycezhong/Salus-1
def main(argv):
    # type: (Sequence[str]) -> None
    scfg = maybe_forced_preset(presets.MostEfficient)
    scfg.scheduler = 'pack'

    cases = (Cases[c] for c in argv) if argv else Cases
    templates = list(gen_workload_list(FLAGS.select_wl))
    if FLAGS.total_num > 0:
        templates = templates[:FLAGS.total_num]

    logger.info("Selected the following list of workloads")
    for wtl, rcfg in templates:
        logger.info(f"    {wtl.canonical_name(rcfg)} of {rcfg.batch_num} iters")

    # Check if workloads have the info we need
    for wtl, rcfg in templates:
        for field in ['jct', 'persistmem']:
            if wtl.geometry(rcfg, Executor.Salus)[field] is None:
                raise ValueError(f'Missing {field} data for workload {wtl.canonical_name(rcfg)} of {rcfg.batch_num} iters, available geometries: {wtl._geometries}')

    for case in cases:
        logdir = FLAGS.save_dir / case.name

        # create workload instances
        workloads = (wtl._create_from_rcfg(rcfg, Executor.Salus) for wtl, rcfg in templates)
        # sort workloads according to the case
        key, desc = case.value
        workloads = sorted(workloads, key=lambda w: w.geometry[key], reverse=desc)

        def limit_concurrent(wls):
            # type: (Iterable[Workload]) -> None
            """Wait for something to finish"""
            gone, alive = SalusServer.wait_workloads(wls, timeout=0)
            while len(alive) >= FLAGS.concurrent_jobs:
                gone, alive = SalusServer.wait_workloads(wls, timeout=0)
                time.sleep(.25)

        actions = chain(*(
            [w, RunFn(limit_concurrent)]
            for w in workloads
        ))

        run_seq(scfg.copy(output_dir=logdir), *actions)
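
limit_concurrent keeps at most FLAGS.concurrent_jobs workloads running by polling SalusServer.wait_workloads after each submission. Below is the same throttling pattern sketched standalone, with subprocess.Popen processes standing in for Salus workloads; this is hypothetical and for illustration only.

import subprocess
import time

def limit_concurrent(procs, max_jobs):
    """Block until fewer than max_jobs of the given processes are still running."""
    while sum(p.poll() is None for p in procs) >= max_jobs:
        time.sleep(0.25)

procs = []
for cmd in [['sleep', '2']] * 8:  # eight dummy jobs
    procs.append(subprocess.Popen(cmd))
    limit_concurrent(procs, max_jobs=3)  # admit the next job only when a slot frees up
for p in procs:
    p.wait()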