def get_executor(flags, logdir):
    """Create a submitit executor writing its logs under *logdir*.

    Local runs get a ``LocalExecutor``; otherwise a ``SlurmExecutor`` is
    configured either for a full training run (2 GPUs) or, when
    ``flags.test_mode`` is set, for a quick CPU-only test run.
    """
    if flags.local:
        executor = submitit.LocalExecutor(folder=logdir)
    else:
        executor = submitit.SlurmExecutor(folder=logdir)
        # NOTE(review): these parameters (partition, num_gpus, constraint)
        # are SLURM-specific, so they are applied on the SLURM path only —
        # confirm they were not meant to apply to local runs as well.
        if flags.test_mode is None:
            # Full run: 2 GPUs on pascal nodes; time is presumably in
            # minutes (submitit SlurmExecutor convention) — confirm.
            executor.update_parameters(
                partition="learnfair",
                time=600,
                nodes=1,
                ntasks_per_node=1,
                job_name="mvfstrl",
                num_gpus=2,
                cpus_per_task=80,
                mem="64GB",
                constraint="pascal",
            )
        else:
            # Test mode: no GPU and a shorter wall time.
            executor.update_parameters(
                partition="learnfair",
                time=120,
                nodes=1,
                ntasks_per_node=1,
                job_name="mvfstrl",
                num_gpus=0,
                cpus_per_task=80,
                mem="64GB",
            )
    return executor
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Submit one job per override list through submitit and collect results.

    Args:
        job_overrides: one sequence of Hydra override strings per job.
        initial_job_idx: offset added to each job's index (for multi-batch
            sweeps).

    Returns:
        The first ``JobReturn`` of each submitted job, in submission order.
    """
    # lazy import to ensure plugin discovery remains fast
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0
    # make sure you don't change inplace
    queue_parameters = self.queue_parameters.copy()
    OmegaConf.set_struct(queue_parameters, True)
    if self.queue == "auto":
        # max_num_timeout is a constructor argument of the executor, not an
        # update_parameters key, so it is popped out of the config first.
        max_num_timeout = self.queue_parameters.auto.max_num_timeout
        with open_dict(queue_parameters):
            del queue_parameters.auto["max_num_timeout"]
        executor = submitit.AutoExecutor(
            folder=self.folder, max_num_timeout=max_num_timeout
        )
    elif self.queue == "slurm":
        max_num_timeout = self.queue_parameters.slurm.max_num_timeout
        with open_dict(queue_parameters):
            del queue_parameters.slurm["max_num_timeout"]
        executor = submitit.SlurmExecutor(
            folder=self.folder, max_num_timeout=max_num_timeout
        )
    elif self.queue == "local":
        executor = submitit.LocalExecutor(folder=self.folder)
    else:
        raise RuntimeError("Unsupported queue type {}".format(self.queue))
    executor.update_parameters(**queue_parameters[self.queue])

    log.info("Sweep output dir : {}".format(self.config.hydra.sweep.dir))
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # sweep.mode is interpreted as an octal permission string (base 8).
        mode = int(str(self.config.hydra.sweep.mode), 8)
        os.chmod(sweep_dir, mode=mode)

    params = []
    for idx, overrides in enumerate(job_overrides):
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                idx,
                f"job_id_for_{idx}",
                Singleton.get_state(),
            )
        )
    # map_array submits all jobs at once (a job array on SLURM); each tuple
    # above is unpacked as the positional arguments of a single call to self.
    jobs = executor.map_array(self, *zip(*params))
    return [j.results()[0] for j in jobs]
def launch_slurm(self, explog, slurm_kwargs=None, executor=None):
    """Submit this callable to SLURM via submitit and record its job id.

    Args:
        explog: experiment log object passed to the job on the worker.
        slurm_kwargs: optional dict of SLURM parameters forwarded to
            ``executor.update_parameters``.
        executor: optional pre-built executor; when None a SlurmExecutor
            is created under ``<logdir>/slurm`` with up to 3 timeout
            requeues.

    Returns:
        The submitted submitit job.
    """
    if executor is None:
        executor = submitit.SlurmExecutor(
            folder=os.path.join(self.exp.logdir, 'slurm'),
            max_num_timeout=3,
        )
    # Bug fix: the default slurm_kwargs=None used to crash with a TypeError
    # when unpacked via **; only update parameters when some were given.
    if slurm_kwargs:
        executor.update_parameters(**slurm_kwargs)
    slurm_job = executor.submit(self, explog)
    self.job_id = slurm_job.job_id
    return slurm_job
def _build_slurm_executor(exp_handle, cfg):
    """Configure and return a submitit SlurmExecutor for this experiment.

    GPUs are packed 8 per node; task count, memory, CPU count, and node
    constraints are all derived from *cfg*.
    """
    executor = submitit.SlurmExecutor(folder=exp_handle.slurm_path)
    assert cfg.num_gpus < 8 or cfg.num_gpus % 8 == 0, cfg.num_gpus

    # Spread the requested GPUs over as many 8-GPU machines as needed.
    if cfg.num_gpus:
        gpus = min(cfg.num_gpus, 8)
        nodes = max(1, cfg.num_gpus // 8)
        assert gpus * nodes == cfg.num_gpus, (
            "Must use 8 gpus per machine when multiple nodes are used."
        )
    else:
        gpus, nodes = 0, 1

    # Either one task per node or one task per GPU.
    ntasks_per_node = 1 if cfg.single_task_per_node else gpus

    slurm_params = {
        "job_name": exp_handle.exp_id,
        "partition": cfg.partition,
        "time": int(cfg.hours * 60),
        "nodes": nodes,
        "num_gpus": gpus,
        "ntasks_per_node": ntasks_per_node,
        "mem": f"{cfg.mem_per_gpu * max(1, gpus)}GB",
        "signal_delay_s": 90,
        "comment": cfg.comment or "",
    }
    if cfg.cpus_per_gpu:
        slurm_params["cpus_per_task"] = (
            cfg.cpus_per_gpu * gpus // ntasks_per_node
        )

    # Later flags win, preserving the original precedence:
    # volta32 < pascal < volta.
    if cfg.volta32:
        slurm_params["constraint"] = "volta32gb"
    if cfg.pascal:
        slurm_params["constraint"] = "pascal"
    if cfg.volta:
        slurm_params["constraint"] = "volta"

    if is_aws():
        # AWS cluster overrides: unlimited mem, single-CPU tasks, fixed
        # partition, and no hardware constraints.
        slurm_params["mem"] = 0
        slurm_params["cpus_per_task"] = 1
        slurm_params["partition"] = "compute"
        slurm_params.pop("constraint", None)

    logging.info("Slurm params: %s", slurm_params)
    executor.update_parameters(**slurm_params)
    return executor
def copy_and_run_with_config(run_fn, run_config, directory, **cluster_config):
    """Copy the current working tree into a job directory and submit run_fn.

    Args:
        run_fn: function executed on the cluster (via ``init_and_run``).
        run_config: configuration object forwarded to ``run_fn``.
        directory: root under which the per-job working directory is made.
        **cluster_config: SLURM parameters; must contain ``job_name``.

    Side effects: copies ``.`` into ``directory/job_name``, then chdirs
    into it before submitting.
    """
    working_directory = pathlib.Path(directory) / cluster_config["job_name"]
    ignore_list = [
        "lightning_logs",
        "logs",
        "checkpoints",
        "experiments",
        ".git",
        "output",
        "val.csv",
        "train.csv",
    ]
    # shutil.ignore_patterns is the stdlib idiom for name-based filtering;
    # the previous hand-rolled lambda ignored the same names in every
    # directory, which this reproduces with the correct callback contract.
    shutil.copytree(
        ".", working_directory, ignore=shutil.ignore_patterns(*ignore_list)
    )
    os.chdir(working_directory)
    print(f"Running at {working_directory}")
    executor = submitit.SlurmExecutor(folder=working_directory)
    executor.update_parameters(**cluster_config)
    job = executor.submit(init_and_run, run_fn, run_config)
    print(f"job_id: {job}")
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import submitit

from compert.train import train_compert, parse_arguments

if __name__ == "__main__":
    args = parse_arguments()

    # One SLURM array job per seed, all sharing the base arguments.
    executor = submitit.SlurmExecutor(folder=args["save_dir"])
    executor.update_parameters(
        time=3 * 24 * 60,
        gpus_per_node=1,
        array_parallelism=512,
        cpus_per_task=4,
        partition="learnfair",
    )

    commands = [
        dict(args, seed=seed) for seed in range(args["sweep_seeds"])
    ]
    executor.map_array(train_compert, commands)
# Build the remaining CLI options, then hand the trainer off to SLURM.
parser.add_argument(
    '--pretrained',
    type=str,
    default='no',
    help='String indicating which pretrained model to use.',
    choices=[
        'no',
        'touch',
        'touch_unoccluded',
        'touch_occluded',
        'unoccluded',
        'occluded',
    ],
)
parser.add_argument('--visualize', action='store_true', default=False)
args = parser.parse_args()

trainer = recon.Engine(args)
submitit_logs_dir = os.path.join(
    'experiments', 'logs', args.exp_type, args.exp_id
)
executor = submitit.SlurmExecutor(submitit_logs_dir, max_num_timeout=3)

# Short wall time for eval-only runs, 48 hours for training.
time = 30 if args.eval else 60 * 48

executor.update_parameters(
    num_gpus=1,
    partition='',
    cpus_per_task=16,
    mem=500000,
    time=time,
    job_name=args.exp_id,
    signal_delay_s=300,
)
executor.submit(trainer)
        "dim_spu": args["dim_spu"],
        "n_envs": args["n_envs"],
        "num_samples": args["num_samples"],
        "data_seed": data_seed,
        "model_seed": model_seed,
        "output_dir": args["output_dir"],
        "callback": args["callback"]
    }
    all_jobs.append(train_args)

# Randomize submission order so job arrays don't process configs in a
# biased sequence.
random.shuffle(all_jobs)

print("Launching {} jobs...".format(len(all_jobs)))

if args["cluster"]:
    # CPU-only SLURM array; parallelism is capped by args["jobs_cluster"].
    executor = submitit.SlurmExecutor(
        folder=f"/checkpoint/{getpass.getuser()}/submitit/")
    executor.update_parameters(
        time=3*24*60,
        gpus_per_node=0,
        array_parallelism=args["jobs_cluster"],
        cpus_per_task=1,
        comment="",
        partition="learnfair")
    executor.map_array(main.run_experiment, all_jobs)
else:
    # Local fallback: run every configuration sequentially in-process.
    for job in all_jobs:
        print(main.run_experiment(job))
def main(argv):
    """Parse a sweep file of CLI command blocks and submit one SLURM job each.

    The file at FLAGS.path contains blank-line-separated commands whose
    ``--key=value`` arguments (one per backslash-continued line) become the
    argument dict for one job. A ``~/<name>/latest`` symlink is kept pointing
    at the newest submission directory.
    """
    now = datetime.datetime.now().strftime("%y-%m-%d_%H-%M-%S-%f")
    rootdir = os.path.expanduser(f"~/{FLAGS.name}")
    submitit_dir = os.path.expanduser(f"~/{FLAGS.name}/{now}")
    executor = submitit.SlurmExecutor(folder=submitit_dir)
    os.makedirs(submitit_dir, exist_ok=True)

    # Repoint the "latest" symlink at this run's directory.
    symlink = os.path.join(rootdir, "latest")
    if os.path.islink(symlink):
        os.remove(symlink)
    if not os.path.exists(symlink):
        os.symlink(submitit_dir, symlink)
    # Bug fix: print() does not do logging-style interpolation; the old call
    # printed a literal "%s" followed by the path.
    print("Symlinked experiment directory: %s" % symlink)

    all_args = list()

    with open(os.path.expanduser(FLAGS.path), "r") as f:
        cmds = "".join(f.readlines()).split("\n\n")
    # Drop the command name (before the first backslash-newline), keep the
    # "--key=value" continuation lines, and skip empty blocks.
    cmds = [cmd.split("\\\n")[1:] for cmd in cmds]
    cmds = [cmd for cmd in cmds if len(cmd) > 0]
    for line in cmds:
        le_args = dict()
        for pair in line:
            # Strip the leading "--"; split on the first "=" only so that
            # values containing "=" are preserved intact.
            key, val = pair.strip()[2:].split("=", 1)
            le_args[key] = val
        if "run_name" not in le_args:
            le_args["run_name"] = generate_slug()
        all_args.append(le_args)

    executor.update_parameters(
        # examples setup
        partition="learnfair",
        # partition="priority",
        comment="ICLR 2021 submission",
        # time=1 * 24 * 60,
        time=1 * 12 * 60,
        nodes=1,
        ntasks_per_node=1,
        # job setup
        job_name=FLAGS.name,
        mem="60GB",
        cpus_per_task=20,
        num_gpus=1,
        # constraint="volta32gb",
        array_parallelism=100,
    )

    print("\nAbout to submit", len(all_args), "jobs")
    if not FLAGS.debug:
        job = executor.map_array(launch_experiment_and_remotenv, all_args)
        for j in job:
            print("Submitted with job id: ", j.job_id)
            print(f"stdout -> {submitit_dir}/{j.job_id}_0_log.out")
            print(f"stderr -> {submitit_dir}/{j.job_id}_0_log.err")
        print(f"Submitted {len(job)} jobs!")
        print()
        print(submitit_dir)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import submitit
from compert.train import train_compert, parse_arguments
import json
import sys

if __name__ == "__main__":
    # Each line of the given file is a JSON object describing one run.
    sweep_path = sys.argv[1]
    with open(sweep_path, "r") as handle:
        commands = [json.loads(row) for row in handle.readlines()]

    executor = submitit.SlurmExecutor(folder="/checkpoint/dlp/sweep_jsonl/")
    executor.update_parameters(
        time=3 * 24 * 60,
        gpus_per_node=1,
        array_parallelism=512,
        cpus_per_task=4,
        comment="Deadline nat biotech this week",
        partition="priority",
    )
    executor.map_array(train_compert, commands)