Example #1
def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                        level=cfg.log_level,
                        datefmt='%Y-%m-%d %H:%M:%S')
    logging.info(cfg.pretty())
    if 'slurm' in cfg.train:
        slurm_dir = Path.cwd() / 'slurm'
        slurm_dir.mkdir()
        executor = submitit.AutoExecutor(slurm_dir)
        executor.update_parameters(
            slurm_gpus_per_node=cfg.train.slurm.gpus_per_node,
            slurm_nodes=cfg.train.slurm.nodes,
            slurm_ntasks_per_node=cfg.train.slurm.gpus_per_node,
            slurm_cpus_per_task=cfg.train.slurm.cpus_per_task,
            slurm_time=cfg.train.slurm.time,
            slurm_additional_parameters={
                'constraint': 'gpu',
                'account': cfg.train.slurm.account
            })
        job = executor.submit(train, cfg=cfg)
        logging.info(f'submitted job {job.job_id}.')
    else:
        train(cfg)
Example #2
def main(args):
    if not os.path.exists('./results'):
        os.mkdir('./results')
    run_IDs = list(range(20))
    errors = {r: None for r in run_IDs}
    jobs = {r: None for r in run_IDs}

    # initialize job executor
    executor = submitit.AutoExecutor(folder="./logs")
    executor.update_parameters(nodes=1,
                               tasks_per_node=1,
                               cpus_per_task=3,
                               slurm_mem='20GB',
                               slurm_gres='gpu:1',
                               slurm_time='8:00:00',
                               slurm_job_name='osc',
                               slurm_array_parallelism=20)

    # execute 3-step process sequentially
    print('step 1: parsing')
    fn = lambda r: get_base_parses(r, reverse=args.reverse)
    jobs, errors = array_step(executor, fn, jobs, run_IDs, errors)
    save_errors(errors)

    print('step 2: optimization')
    fn = lambda r: optimize_parses(r, reverse=args.reverse)
    jobs, errors = array_step(executor, fn, jobs, run_IDs, errors)
    save_errors(errors)

    print('step 3: re-fitting')
    executor.update_parameters(
        slurm_time='48:00:00')  # more compute time needed for this step
    fn = lambda r: refit_parses_multi(r, reverse=args.reverse)
    jobs, errors = array_step(executor, fn, jobs, run_IDs, errors)
    save_errors(errors)
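The array_step and save_errors helpers used above are defined elsewhere in the original script. For orientation, a minimal sketch of what array_step might do, assuming it submits one job per run ID as a Slurm job array and records per-run failures (the body below is illustrative, not the original implementation):

def array_step(executor, fn, jobs, run_IDs, errors):
    # map_array turns all runs into a single Slurm job array, bounded by the
    # slurm_array_parallelism value set on the executor above
    submitted = executor.map_array(fn, run_IDs)
    for r, job in zip(run_IDs, submitted):
        jobs[r] = job
        try:
            job.result()  # blocks until this run finishes
        except Exception as e:  # record the failure, let the other runs continue
            errors[r] = repr(e)
    return jobs, errors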
Example #3
def launch():
    executor = submitit.AutoExecutor(folder=args.folder)
    executor.update_parameters(
        slurm_partition=args.partition,
        slurm_constraint=args.device,
        slurm_comment='comms release April 30',
        slurm_mem='450G',
        timeout_min=args.time,
        nodes=args.nodes,
        tasks_per_node=args.tasks_per_node,
        cpus_per_task=10,
        gpus_per_node=args.tasks_per_node)

    config_fnames = [args.fname]
    if args.batch_launch:
        with open(args.fname, 'r') as y_file:
            config_fnames = yaml.load(y_file, Loader=yaml.FullLoader)

    jobs, trainers = [], []
    with executor.batch():
        for cf in config_fnames:
            fb_trainer = Trainer(args.sel, cf)
            job = executor.submit(fb_trainer,)
            trainers.append(fb_trainer)
            jobs.append(job)

    for job in jobs:
        print(job.job_id)
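A note on the executor.batch() context used above: submitit delays every submit made inside the with block and groups them into a single Slurm job array when the block exits. That is why the job IDs are printed only after the block; reading job.job_id inside it would fail because the jobs have not actually been submitted yet.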
Example #4
def launch_benchmark_suite_scheduler(config_file):
    assert g_pathmgr.exists(
        config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    config = _DEFAULT_CONFIG.copy()
    recursive_dict_merge(config, user_config)

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler)
    executor = submitit.AutoExecutor(
        folder=benchmark_suite_scheduler.evaluation_dir())

    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    slurm_options = AttrDict(config["slurm_options"])
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")
Example #5
def main():
    args = opts.parse_args()
    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(
        folder=utils.get_shared_folder(args.name) / "%j")
    num_gpus_per_node = 8
    args.batch_size = args.batch_size * num_gpus_per_node
    executor.update_parameters(
        mem_gb=45 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        # tasks_per_node=1,  # one task per GPU
        cpus_per_task=80,
        nodes=1,
        timeout_min=60 * 16,
        # Below are cluster dependent parameters
        slurm_partition="dev",
        slurm_signal_delay_s=120,
    )

    executor.update_parameters(name=args.name)

    args.dist_url = utils.get_init_file(args.name).as_uri()
    args.output_dir = str(utils.get_shared_folder(args.name))
    trainer = Trainer(args)
    job = executor.submit(trainer)
Example #6
    def launch(
        self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
    ) -> Sequence[JobReturn]:
        # lazy import to ensure plugin discovery remains fast
        import submitit

        num_jobs = len(job_overrides)
        assert num_jobs > 0
        params = self.params
        # build executor
        init_params = {"folder": self.params["submitit_folder"]}
        specific_init_keys = {"max_num_timeout"}

        init_params.update(
            **{
                f"{self._EXECUTOR}_{x}": y
                for x, y in params.items()
                if x in specific_init_keys
            }
        )
        init_keys = specific_init_keys | {"submitit_folder"}
        executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **init_params)

        # specify resources/parameters
        baseparams = set(dataclasses.asdict(BaseTarget()).keys())
        params = {
            x if x in baseparams else f"{self._EXECUTOR}_{x}": y
            for x, y in params.items()
            if x not in init_keys
        }
        executor.update_parameters(**params)

        log.info(
            f"Submitit '{self._EXECUTOR}' sweep output dir : "
            f"{self.config.hydra.sweep.dir}"
        )
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        if "mode" in self.config.hydra.sweep:
            mode = int(str(self.config.hydra.sweep.mode), 8)
            os.chmod(sweep_dir, mode=mode)

        params = []

        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            lst = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {lst}")
            params.append(
                (
                    list(overrides),
                    "hydra.sweep.dir",
                    idx,
                    f"job_id_for_{idx}",
                    Singleton.get_state(),
                )
            )

        jobs = executor.map_array(self, *zip(*params))
        return [j.results()[0] for j in jobs]
Example #7
def init_executor(executor, args):
    log_folder = f"{args.log_dir}/%j"
    executor = submitit.AutoExecutor(folder=log_folder)
    executor.update_parameters(timeout_min=4,
                               slurm_partition="dev",
                               gpus_per_node=args.ngpus)
    return executor
Example #8
def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir,
                                     slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
    )

    executor.update_parameters(name="detr")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)
Example #9
def main(args):

    files = Path(args.alignment_dir).glob("*.a3m")
    output_dir = Path(args.output_dir)

    def commands():
        for file in files:
            base_command = [
                "bash",
                "run_training_in_conda_env.sh",
                str(file),
                str(output_dir),
            ]
            yield base_command

    executor = submitit.AutoExecutor(
        folder=f"/checkpoint/{os.environ['USER']}/deepsequence-timing-logs")
    executor.update_parameters(
        timeout_min=3000,
        slurm_partition="learnfair",
        gpus_per_node=1,
        mem_gb=64,
        cpus_per_task=10,
        slurm_constraint="volta32gb",
        slurm_array_parallelism=32,
    )

    runfunc = partial(timed_run, output_dir=output_dir)
    with executor.batch():
        for command in commands():
            executor.submit(runfunc, command)
Example #10
def create_submitit_executor(cfg: AttrDict):
    """
    Utility function to create a SLURM submitit executor, which
    is able to schedule arbitrary functions on a SLURM cluster

    The configuration of the executor is derived from the SLURM part
    of the VISSL configuration provided as parameter
    """
    import submitit

    log_folder = cfg.SLURM.LOG_FOLDER
    makedir(log_folder)
    assert g_pathmgr.exists(
        log_folder
    ), f"Specified config.SLURM.LOG_FOLDER={log_folder} doesn't exist"
    assert cfg.SLURM.PARTITION, "SLURM.PARTITION must be set when using SLURM"

    executor = submitit.AutoExecutor(folder=log_folder)
    timeout_min = cfg.SLURM.TIME_HOURS * 60 + cfg.SLURM.TIME_MINUTES
    executor.update_parameters(
        name=cfg.SLURM.NAME,
        slurm_comment=cfg.SLURM.COMMENT,
        slurm_partition=cfg.SLURM.PARTITION,
        slurm_constraint=cfg.SLURM.CONSTRAINT,
        timeout_min=timeout_min,
        nodes=cfg.DISTRIBUTED.NUM_NODES,
        cpus_per_task=cfg.SLURM.NUM_CPU_PER_PROC *
        cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        tasks_per_node=1,
        gpus_per_node=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        mem_gb=cfg.SLURM.MEM_GB,
        slurm_additional_parameters=cfg.SLURM.ADDITIONAL_PARAMETERS,
    )
    return executor
Example #11
def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    if is_rank_zero():
        logger.setLevel(cfg.log_level)
        logging.info(OmegaConf.to_yaml(cfg))
        wandb_version = wandb.util.generate_id()
        add_wandb_version(cfg, wandb_version)
    if cfg.cluster.name == 'slurm':
        slurm_dir = Path.cwd() / 'slurm'
        slurm_dir.mkdir()
        logging.info(f'Slurm logs: {slurm_dir}')
        executor = submitit.AutoExecutor(slurm_dir)
        executor.update_parameters(
            slurm_gpus_per_node=cfg.cluster.gpus_per_node,
            slurm_nodes=cfg.cluster.nodes,
            slurm_ntasks_per_node=cfg.cluster.gpus_per_node,
            slurm_cpus_per_task=cfg.cluster.cpus_per_task,
            slurm_time=cfg.cluster.time,
            slurm_additional_parameters={
                'constraint': 'gpu',
                'account': cfg.cluster.account,
                'requeue': True
            })
        job = executor.submit(train, cfg=cfg)
        logging.info(f'submitted job {job.job_id}.')
    else:
        train(cfg)
Example #12
def main():
    args = parse()
    out_sents = []
    with open(args.data_path, "r") as fp:
        sent_list = [x.strip() for x in fp.readlines()]
    if args.parallel_process_num > 1:
        try:
            import submitit
        except ImportError:
            logger.warning(
                "submitit is not found; only one job will be used to process the data"
            )
            submitit = None

    if args.parallel_process_num == 1 or submitit is None:
        out_sents = process_sents(sent_list, args)
    else:
        # process sentences with parallel computation
        lsize = len(sent_list) // args.parallel_process_num + 1
        executor = submitit.AutoExecutor(folder=args.logdir)
        executor.update_parameters(timeout_min=1000, cpus_per_task=4)
        jobs = []
        for i in range(args.parallel_process_num):
            job = executor.submit(process_sents,
                                  sent_list[lsize * i:lsize * (i + 1)], args)
            jobs.append(job)
        is_running = True
        while is_running:
            time.sleep(5)
            is_running = sum([job.done() for job in jobs]) < len(jobs)
        out_sents = list(
            itertools.chain.from_iterable([job.result() for job in jobs]))
    with open(args.out_path, "w") as fp:
        fp.write("\n".join(out_sents) + "\n")
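A side note on the polling loop above: job.result() itself blocks until the job completes and re-raises any remote exception, so the sleep loop is only needed if you want to interleave other work. An equivalent blocking collection would be:

out_sents = list(
    itertools.chain.from_iterable(job.result() for job in jobs))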
Example #13
def main():
    t0 = time.time()
    # Cleanup log folder.
    # This folder may grow rapidly especially if you have large checkpoints,
    # or submit a lot of jobs. You should think about an automated way of cleaning it.
    folder = Path(__file__).parent / "mnist_logs"
    if folder.exists():
        for file in folder.iterdir():
            file.unlink()

    ex = submitit.AutoExecutor(folder)
    if ex.cluster == "slurm":
        print("Executor will schedule jobs on Slurm.")
    else:
        print(
            f"!!! Slurm executable `srun` not found. Will execute jobs on '{ex.cluster}'"
        )

    model_path = folder / "model.pkl"
    trainer = MnistTrainer(
        LogisticRegression(penalty="l1",
                           solver="saga",
                           tol=0.1,
                           multi_class="auto"))

    # Specify the job requirements.
    # Reserving only as much resource as you need ensures the cluster resources are
    # efficiently allocated.
    ex.update_parameters(mem_gb=1, cpus_per_task=4, timeout_min=5)
    job = ex.submit(trainer, 5000, model_path=model_path)

    print(f"Scheduled {job}.")

    # Wait for the job to be running.
    while job.state != "RUNNING":
        time.sleep(1)

    print("Run the following command to see what's happening")
    print(f"  less +F {job.paths.stdout}")

    # Simulate preemption.
    # Tries to stop the job after the first stage.
    # If the job is preempted before the end of the first stage, try to increase it.
    # If the job is not preempted, try to decrease it.
    time.sleep(25)
    print(f"preempting {job} after {time.time() - t0:.0f}s")
    job._interrupt()

    score = job.result()
    print(f"Finished training. Final score: {score}.")
    print(f"---------------- Job output ---------------------")
    print(job.stdout())
    print(f"-------------------------------------------------")

    assert model_path.exists()
    with open(model_path, "rb") as f:
        (scaler, clf) = pickle.load(f)
    sparsity = np.mean(clf.coef_ == 0) * 100
    print(f"Sparsity with L1 penalty: {sparsity / 100:.2%}")
Example #14
def test_slurm_through_auto(params: tp.Dict[str, int], tmp_path: Path) -> None:
    with mocked_slurm():
        executor = submitit.AutoExecutor(folder=tmp_path)
        executor.update_parameters(**params, slurm_additional_parameters={"mem_per_gpu": 12})
        job = executor.submit(test_core.do_nothing, 1, 2, blublu=3)
    text = job.paths.submission_file.read_text()
    mem_lines = [x for x in text.splitlines() if "#SBATCH --mem" in x]
    assert len(mem_lines) == 1, f"Unexpected lines: {mem_lines}"
Example #15
    def launch(
        self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
    ) -> Sequence[JobReturn]:
        # lazy import to ensure plugin discovery remains fast
        import submitit

        num_jobs = len(job_overrides)
        assert num_jobs > 0

        # make sure you don't change inplace
        queue_parameters = self.queue_parameters.copy()
        OmegaConf.set_struct(queue_parameters, True)
        if self.queue == "auto":
            max_num_timeout = self.queue_parameters.auto.max_num_timeout
            with open_dict(queue_parameters):
                del queue_parameters.auto["max_num_timeout"]
            executor = submitit.AutoExecutor(
                folder=self.folder, max_num_timeout=max_num_timeout
            )
        elif self.queue == "slurm":
            max_num_timeout = self.queue_parameters.slurm.max_num_timeout
            with open_dict(queue_parameters):
                del queue_parameters.slurm["max_num_timeout"]
            executor = submitit.SlurmExecutor(
                folder=self.folder, max_num_timeout=max_num_timeout
            )
        elif self.queue == "local":
            executor = submitit.LocalExecutor(folder=self.folder)
        else:
            raise RuntimeError("Unsupported queue type {}".format(self.queue))

        executor.update_parameters(**queue_parameters[self.queue])

        log.info("Sweep output dir : {}".format(self.config.hydra.sweep.dir))
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        if "mode" in self.config.hydra.sweep:
            mode = int(str(self.config.hydra.sweep.mode), 8)
            os.chmod(sweep_dir, mode=mode)

        params = []

        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            lst = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {lst}")
            params.append(
                (
                    list(overrides),
                    "hydra.sweep.dir",
                    idx,
                    f"job_id_for_{idx}",
                    Singleton.get_state(),
                )
            )

        jobs = executor.map_array(self, *zip(*params))
        return [j.results()[0] for j in jobs]
Example #16
def get_executor(job_name,
                 timeout_hour=60,
                 n_gpus=1,
                 project='fastmri',
                 no_force_32=False,
                 torch=False):
    executor = submitit.AutoExecutor(folder=job_name)
    if timeout_hour > 20:
        qos = 't4'
    elif timeout_hour > 2:
        qos = 't3'
    else:
        qos = 'dev'
    multi_node = n_gpus > 8
    if multi_node:
        assert n_gpus % 4 == 0, 'Use multiple of 4 GPUs for multi node training'
        assert timeout_hour <= 20, 'Use t3 qos for multi node training'
        multi_node = True
        n_nodes = n_gpus // 4
        n_gpus = n_gpus // n_nodes
    cpu_per_gpu = 3 if n_gpus > 4 else 10
    tasks_per_node = 1
    cpus_per_task = cpu_per_gpu * n_gpus
    slurm_params = {
        'ntasks-per-node': tasks_per_node,
        'cpus-per-task': cpus_per_task,
        'account': 'hih@gpu',
        'qos': f'qos_gpu-{qos}',
        'distribution': 'block:block',
        'hint': 'nomultithread',
    }
    slurm_setup = [
        '#SBATCH -C v100-32g',
        'cd $WORK/submission-scripts/jean_zay/env_configs',
        f'. {project}.sh',
    ]
    if n_gpus > 4 and n_gpus < 8:
        slurm_params.update({'partition': 'gpu_p2'})
    if (n_gpus > 4 or no_force_32) and n_gpus < 8:
        slurm_setup = slurm_setup[1:]
    if multi_node:
        slurm_params.update({
            'nodes': n_nodes,
        })
        slurm_setup.append(
            'unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY')
    executor.update_parameters(
        timeout_min=60,
        tasks_per_node=tasks_per_node,
        cpus_per_task=cpus_per_task,
        slurm_job_name=job_name,
        slurm_time=f'{timeout_hour}:00:00',
        slurm_gres=f'gpu:{n_gpus}',
        slurm_additional_parameters=slurm_params,
        slurm_setup=slurm_setup,
    )
    return executor
Example #17
def get_executor(local=False, batch=None):
    if local:
        return submitit.LocalExecutor(folder="/tmp/submitit-logs")

    executor = submitit.AutoExecutor(folder="/checkpoint/jjgo/submitit-logs/")
    if batch is not None:
        assert isinstance(batch, int)
        executor.update_parameters(slurm_array_parallelism=batch)

    return executor
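A short usage sketch for the executor above (process_shard and the shard count are illustrative, not part of the original code): with slurm_array_parallelism set, map_array submits all tasks as one Slurm job array and caps how many run at the same time.

executor = get_executor(batch=16)
# one job array of 100 tasks, at most 16 running concurrently
jobs = executor.map_array(process_shard, list(range(100)))
results = [job.result() for job in jobs]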
Example #18
def get_submitit_executor(n_jobs=10, comment="", partition='learnfair'):
    if not is_submitit_available():
        raise Exception('Submitit Not installed')
    executor = submitit.AutoExecutor(folder='PAQ_embedding_jobs')
    executor.update_parameters(timeout_min=120,
                               slurm_partition=partition,
                               slurm_nodes=1,
                               slurm_ntasks_per_node=1,
                               slurm_cpus_per_task=10,
                               slurm_constraint='volta32gb',
                               slurm_gpus_per_node='volta:1',
                               slurm_array_parallelism=n_jobs,
                               slurm_comment=comment,
                               slurm_mem='64G')
    return executor
Example #19
def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    logger.setLevel(cfg.log_level)
    logging.info(cfg.pretty())
    if 'slurm' in cfg.train:
        slurm_dir = Path.cwd() / 'slurm'
        slurm_dir.mkdir()
        executor = submitit.AutoExecutor(slurm_dir)
        executor.update_parameters(
            slurm_gpus_per_node=cfg.train.slurm.gpus_per_node,
            slurm_nodes=cfg.train.slurm.nodes,
            slurm_ntasks_per_node=cfg.train.slurm.gpus_per_node,
            slurm_cpus_per_task=cfg.train.slurm.cpus_per_task,
            slurm_time=cfg.train.slurm.time,
            slurm_additional_parameters={
                'constraint': 'gpu',
                'account': cfg.train.slurm.account
            })
        job = executor.submit(train, cfg=cfg, output_dir=Path.cwd())
        logging.info(f'submitted job {job.job_id}.')
    else:
        train(cfg)
Example #20
def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder(args) / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir,
                                     slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout
    kwargs = {}
    if args.use_volta32:
        kwargs['constraint'] = 'volta32gb'
    if args.comment:
        kwargs['comment'] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=10080,  # max is 60 * 72
        # Below are cluster dependent parameters
        slurm_gres=f"gpu:rtx8000:{num_gpus_per_node}",  # you can choose to comment this, or change it to v100 as per your need
        slurm_signal_delay_s=120,
        **kwargs)

    executor.update_parameters(name="detectransformer")
    if args.mail:
        executor.update_parameters(additional_parameters={
            'mail-user': args.mail,
            'mail-type': 'END'
        })

    executor.update_parameters(
        slurm_additional_parameters={'gres-flags': 'enforce-binding'})

    args.dist_url = get_init_file(args).as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)
Example #21
def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, max_num_timeout=30)
    # executor = submitit.LocalExecutor(folder=get_shared_folder() / "%j")

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    partition = args.partition
    timeout_min = args.timeout
    kwargs = {}
    if args.use_volta32:
        kwargs["constraint"] = "volta32gb"
    if args.comment:
        kwargs["comment"] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        # Below are cluster dependent parameters
        hostgroup="fblearner_ash_bigsur_fair",
        partition=partition,
        signal_delay_s=120,
        **kwargs,
    )

    executor.update_parameters(name="detectransformer")
    if args.mail:
        executor.update_parameters(additional_parameters={
            "mail-user": args.mail,
            "mail-type": "END"
        })

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)
Example #22
def get_slurm_executor(slurm_config, timeout=100, job_name="benchopt_run"):

    with open(slurm_config, "r") as f:
        config = yaml.safe_load(f)

    # If the job timeout is not specified in the config file, use 1.5x the
    # benchopt timeout. This value is a trade-off between helping the
    # scheduler (a low slurm_time lets the job be scheduled faster) and avoiding
    # killing the job too early.
    if 'slurm_time' not in config:
        # Timeout is in second in benchopt
        config['slurm_time'] = f"00:{int(1.5*timeout)}"

    executor = submitit.AutoExecutor(job_name)
    executor.update_parameters(**config)
    return executor
Example #23
    def set_it_up(self, experiment_directory):
        # create the submitit executor for creating and managing jobs
        executor = submitit.AutoExecutor(
            folder=os.path.join(experiment_directory, "Logs"))

        # setup the executor parameters based on the cluster location
        if executor.cluster == "slurm":
            executor.update_parameters(
                mem_gb=8,
                cpus_per_task=4,
                timeout_min=1000,
                tasks_per_node=1,
                nodes=1,
                slurm_partition="long",
                #gres="gpu:rtx8000:1",
            )
        return executor
Example #24
def main():
    args = parse_args()

    # Note that the folder will depend on the job_id, to easily track experiments
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    executor = submitit.AutoExecutor(
        folder=args.job_dir, cluster=args.cluster, slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.num_gpus
    nodes = args.nodes
    timeout_min = args.timeout

    if args.slurm_gres:
        slurm_gres = args.slurm_gres
    else:
        slurm_gres = f'gpu:{num_gpus_per_node},VRAM:{args.vram}'

    executor.update_parameters(
        mem_gb=args.mem_per_gpu * num_gpus_per_node,
        # gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=2,
        nodes=nodes,
        timeout_min=timeout_min,
        slurm_partition=args.slurm_partition,
        slurm_constraint=args.slurm_constraint,
        slurm_comment=args.slurm_comment,
        slurm_exclude=args.slurm_exclude,
        slurm_gres=slurm_gres
    )

    executor.update_parameters(name="fair_track")

    args.dist_url = get_init_file().as_uri()
    # args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)

    if args.cluster == 'debug':
        job.wait()
Example #25
def main():
    args = parse_args()

    if args.name == "":
        cfg_name = os.path.splitext(os.path.basename(args.cfg_file))[0]
        args.name = '_'.join([cfg_name, args.postfix])

    assert args.job_dir != ""

    args.output_dir = str(args.job_dir)
    args.job_dir = Path(args.job_dir) / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    #executor = submitit.AutoExecutor(folder=Path(args.job_dir) / "%j", slurm_max_num_timeout=30)
    executor = submitit.AutoExecutor(folder=args.job_dir,
                                     slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.num_gpus
    nodes = args.num_shards
    partition = args.partition
    timeout_min = args.timeout
    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb,ib4'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        mem_gb=60 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=1,
        cpus_per_task=10 * num_gpus_per_node,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        **kwargs)

    print(args.name)
    executor.update_parameters(name=args.name)

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)
Example #26
def cli(
    num_tables,
    num_embeddings,
    embedding_dim,
    batch_size,
    bag_size,
    iters,
    remote,
    fp16,
    managed,
    mixed,
):
    def f():
        import torch

        benchmark_forward(
            batch_size,
            num_embeddings,
            num_tables,
            bag_size,
            embedding_dim,
            iters,
            fp16,
            managed,
            mixed,
        )

    if remote:
        import submitit

        executor = submitit.AutoExecutor(folder="sparse_embedding_perf")
        executor.update_parameters(
            timeout_min=10, partition="dev", constraint="volta32gb", gpus_per_node=1
        )
        job = executor.submit(f)
        job.wait()
        job.result()
        logging.info("Finished")
        import time

        time.sleep(1)
        print(job.stdout())
        print(job.stderr(), file=sys.stderr)
        logging.info("Finished")
    else:
        f()
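A note on the update_parameters call above: recent submitit releases expect cluster-specific options passed through AutoExecutor to carry the slurm_ prefix (as most of the other examples here do), while generic options such as timeout_min and gpus_per_node stay unprefixed. An equivalent call would then look roughly like:

executor.update_parameters(
    timeout_min=10,
    slurm_partition="dev",
    slurm_constraint="volta32gb",
    gpus_per_node=1,
)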
Example #27
def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir,
                                     slurm_max_num_timeout=30)

    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    partition = args.partition
    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        # mem_gb=40 * num_gpus_per_node,
        # gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        # cpus_per_task=10,
        nodes=nodes,
        timeout_min=60 * 24 * 10,  # max is 60 * 72
        # Below are cluster dependent parameters
        slurm_gres="gpu:%d" % num_gpus_per_node,
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        slurm_additional_parameters={
            'qos': 'non-preemptable',
            'mpi': 'pmi2'
        },
        **kwargs)

    executor.update_parameters(name="deit")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)
Example #28
def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder(args) / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments

    executor = submitit.AutoExecutor(folder=args.job_dir,
                                     slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    partition = args.partition
    timeout_min = args.timeout
    kwargs = {}
    if partition is not None:
        kwargs["slurm_partition"] = partition

    executor.update_parameters(
        mem_gb=62 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        # Below are cluster dependent parameters
        slurm_signal_delay_s=120,
        **kwargs,
    )

    executor.update_parameters(name="detectransformer")
    if args.mail:
        executor.update_parameters(additional_parameters={
            "mail-user": args.mail,
            "mail-type": "END"
        })

    args.dist_url = get_init_file(args).as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)
Example #29
def main():
    args = parse_args()
    if args.job_dir == '':
        args.job_dir = get_shared_folder() / '%j'
    executor = submitit.AutoExecutor(folder=args.job_dir,
                                     slurm_max_num_timeout=30)
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout
    executor.update_parameters(mem_gb=40 * num_gpus_per_node,
                               gpus_per_node=num_gpus_per_node,
                               tasks_per_node=num_gpus_per_node,
                               cpus_per_task=10,
                               nodes=nodes,
                               timeout_min=timeout_min)
    executor.update_parameters(name='detr')
    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir
    trainer = Trainer(args)
    job = executor.submit(trainer)
    print('Submitted job_id:', job.job_id)
Example #30
def cli(num_tables, num_embeddings, embedding_dim, dense_features_dim,
        batch_size, bag_size, iters, remote):
    def f():
        benchmark_torch_snn_forward("dlrm", num_tables, num_embeddings,
                                    embedding_dim, dense_features_dim,
                                    batch_size, bag_size, iters)
        benchmark_torch_uniform_snn_forward("fused", num_tables,
                                            num_embeddings, embedding_dim,
                                            dense_features_dim, batch_size,
                                            bag_size, iters)
        benchmark_torch_uniform_snn_forward("fused-fp16", num_tables,
                                            num_embeddings, embedding_dim,
                                            dense_features_dim, batch_size,
                                            bag_size, iters, fp16=1)

        # benchmark_torch_uniform_snn_forward("fused",
        #                                     num_tables,
        #                                     num_embeddings,
        #                                     embedding_dim,
        #                                     dense_features_dim,
        #                                     batch_size,
        #                                     bag_size,
        #                                     iters,
        #                                     fp16=1)

    if remote:
        import submitit
        import sys
        executor = submitit.AutoExecutor(folder="dlrm_perf")
        executor.update_parameters(timeout_min=10,
                                   partition="dev",
                                   constraint="volta",
                                   gpus_per_node=1)
        job = executor.submit(f)
        job.wait()
        job.result()
        logging.info("Finished")
        import time
        time.sleep(1)
        print(job.stdout())
        print(job.stderr(), file=sys.stderr)
        logging.info("Finished")
    else:
        f()