def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Write slurm/sh scripts for every override set and submit each job.

    Jobs are numbered ``initial_job_idx + position``. Note that ``runs`` is
    never populated here: submission is fire-and-forget, so an empty list is
    returned.
    """
    setup_globals()
    assert self.hydra_context is not None
    assert self.config is not None
    assert self.task_function is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    Path(str(self.config.hydra.sweep.dir)).mkdir(parents=True, exist_ok=True)
    log.info(f"Launching {len(job_overrides)} jobs on slurm")
    runs: List[JobReturn] = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        log.info(f"\t#{job_num} : {' '.join(filter_overrides(overrides))}")
        sweep_config = self.hydra_context.config_loader.load_sweep_config(
            self.config, list(overrides)
        )
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = job_num
            sweep_config.hydra.job.num = job_num
        HydraConfig.instance().set_config(sweep_config)
        resolved_name = slurm_utils.resolve_name(sweep_config.slurm.job_name)
        log.info(f"\tJob name : {resolved_name}")
        # emit the sbatch script, the shell entry point, then submit
        slurm_utils.write_slurm(sweep_config)
        slurm_utils.write_sh(sweep_config, " ".join(filter_overrides(overrides)))
        slurm_utils.launch_job(sweep_config)
        # restore Hydra's own logging configuration after each submission
        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
        # NOTE(review): presumably throttles submission rate when the config
        # asks to wait between jobs — confirm semantics of `wait`
        if sweep_config.wait:
            time.sleep(1)
    return runs
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Run every override set sequentially in this process via ``run_job``.

    Jobs are numbered ``initial_job_idx + position``; each result is collected
    and the full list returned in input order.
    """
    setup_globals()
    assert self.config is not None
    assert self.task_function is not None
    assert self.config_loader is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    Path(str(self.config.hydra.sweep.dir)).mkdir(parents=True, exist_ok=True)
    log.info(f"Launching {len(job_overrides)} jobs locally")
    runs: List[JobReturn] = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        log.info(f"\t#{job_num} : {' '.join(filter_overrides(overrides))}")
        sweep_config = self.config_loader.load_sweep_config(
            self.config, list(overrides)
        )
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = job_num
            sweep_config.hydra.job.num = job_num
        runs.append(
            run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
        )
        # run_job reconfigures logging for the job; restore Hydra's own setup
        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    return runs
def dispatch_job(
    idx: int,
    overrides: Sequence[str],
    config_loader: ConfigLoader,
    config: DictConfig,
    task_function: TaskFunction,
    singleton_state: Dict[Any, Any],
) -> JobReturn:
    """Calls `run_job` in parallel

    Note that Joblib's default backend runs isolated Python processes, see
    https://joblib.readthedocs.io/en/latest/parallel.html#shared-memory-semantics
    """
    setup_globals()
    # restore singletons (e.g. Hydra state) inside the isolated worker process
    Singleton.set_state(singleton_state)
    description = " ".join(filter_overrides(overrides))
    log.info(f"\t#{idx} : {description}")
    sweep_config = config_loader.load_sweep_config(config, list(overrides))
    with open_dict(sweep_config):
        sweep_config.hydra.job.id = f"{sweep_config.hydra.job.name}_{idx}"
        sweep_config.hydra.job.num = idx
    HydraConfig.instance().set_config(sweep_config)
    return run_job(
        config=sweep_config,
        task_function=task_function,
        job_dir_key="hydra.sweep.dir",
        job_subdir_key="hydra.sweep.subdir",
    )
def launch(self, job_overrides: Sequence[Sequence[str]]) -> Sequence[JobReturn]:
    """Execute every job sequentially in this process and collect the returns.

    Jobs are numbered by their position in ``job_overrides``.
    """
    setup_globals()
    assert self.config is not None
    assert self.task_function is not None
    assert self.config_loader is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    Path(str(self.config.hydra.sweep.dir)).mkdir(parents=True, exist_ok=True)
    log.info(f"Launching {len(job_overrides)} jobs locally")
    runs: List[JobReturn] = []
    for idx, overrides in enumerate(job_overrides):
        log.info(f"\t#{idx} : {' '.join(filter_overrides(overrides))}")
        sweep_config = self.config_loader.load_sweep_config(
            self.config, list(overrides)
        )
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = idx
            sweep_config.hydra.job.num = idx
        HydraConfig.instance().set_config(sweep_config)
        runs.append(
            run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
        )
        # run_job reconfigures logging for the job; restore Hydra's own setup
        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    return runs
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Submit all overrides as one submitit job array and block for results.

    :param job_overrides: one list of override strings per job.
    :param initial_job_idx: offset added to each job's index within the batch.
    :return: the first result of every submitted job, in submission order.
    """
    # lazy import to ensure plugin discovery remains fast
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0
    params = self.params
    # build executor
    init_params = {"folder": self.params["submitit_folder"]}
    specific_init_keys = {"max_num_timeout"}
    # constructor-only keys must be passed prefixed with the executor name
    # (e.g. "slurm_max_num_timeout") rather than via update_parameters
    init_params.update(
        **{
            f"{self._EXECUTOR}_{x}": y
            for x, y in params.items()
            if x in specific_init_keys
        }
    )
    init_keys = specific_init_keys | {"submitit_folder"}
    executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **init_params)

    # specify resources/parameters
    # keys shared by every target keep their plain name; executor-specific
    # ones get the "<executor>_" prefix expected by AutoExecutor
    baseparams = set(dataclasses.asdict(BaseTarget()).keys())
    params = {
        x if x in baseparams else f"{self._EXECUTOR}_{x}": y
        for x, y in params.items()
        if x not in init_keys
    }
    executor.update_parameters(**params)

    log.info(
        f"Submitit '{self._EXECUTOR}' sweep output dir : "
        f"{self.config.hydra.sweep.dir}"
    )
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # sweep.mode is interpreted as an octal permission string, e.g. "777"
        mode = int(str(self.config.hydra.sweep.mode), 8)
        os.chmod(sweep_dir, mode=mode)

    params = []
    for idx, overrides in enumerate(job_overrides):
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        # positional arguments consumed by self.__call__ on the worker side
        params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                idx,
                f"job_id_for_{idx}",
                Singleton.get_state(),
            )
        )

    # one array submission for all jobs; `self` is the picklable callable
    jobs = executor.map_array(self, *zip(*params))
    # j.results() waits for job completion (submitit API)
    return [j.results()[0] for j in jobs]
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.hydra_context is not None
    assert self.task_function is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"Example Launcher(foo={self.foo}, bar={self.bar}) is launching {len(job_overrides)} jobs locally"
    )
    log.info(f"Sweep output dir : {sweep_dir}")
    runs = []
    for idx, overrides in enumerate(job_overrides):
        # number the job globally within the batch
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        sweep_config = self.hydra_context.config_loader.load_sweep_config(
            self.config, list(overrides)
        )
        with open_dict(sweep_config):
            # The job id is typically coming from the underlying scheduler (SLURM_JOB_ID for instance).
            # In that case, it will not be available here because we are still in the main process,
            # but instead should be populated remotely before calling the task_function.
            sweep_config.hydra.job.id = f"job_id_for_{idx}"
            sweep_config.hydra.job.num = idx

        # If your launcher is executing code in a different process, it is important to restore
        # the singleton state in the new process.
        # To do this, you will likely need to serialize the singleton state along with the other
        # parameters passed to the child process.

        # happening on this process (executing launcher)
        state = Singleton.get_state()
        # happening on the spawned process (executing task_function in run_job)
        Singleton.set_state(state)

        ret = run_job(
            hydra_context=self.hydra_context,
            task_function=self.task_function,
            config=sweep_config,
            job_dir_key="hydra.sweep.dir",
            job_subdir_key="hydra.sweep.subdir",
        )
        runs.append(ret)
        # reconfigure the logging subsystem for Hydra as the run_job call configured it for the Job.
        # This is needed for launchers that calls run_job in the same process and not spawn a new one.
        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    return runs
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """Pickle the sweep jobs locally, then run them on a Ray AWS cluster.

    :param launcher: the configured RayAWSLauncher instance.
    :param job_overrides: one list of override strings per job.
    :param initial_job_idx: offset added to each job's index within the batch.
    :return: the JobReturns produced by ``launch_jobs``.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None
    setup_commands = launcher.env_setup.commands
    with read_write(setup_commands):
        # install pinned pip packages so the remote env matches the local one,
        # then append the cluster's own setup commands
        setup_commands.extend(
            [
                f"pip install {package}=={version}"
                for package, version in launcher.env_setup.pip_packages.items()
            ]
        )
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)
    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands
    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")
    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for idx, overrides in enumerate(job_overrides):
            # number the job globally within the batch
            idx = initial_job_idx + idx
            ostr = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {ostr}")
            sweep_config = launcher.config_loader.load_sweep_config(
                launcher.config, list(overrides)
            )
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = idx
            sweep_configs.append(sweep_config)
        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )
        # delete=False: the yaml must outlive this context manager so the
        # ray tooling can read it when bringing the cluster up
        with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
            with open(f.name, "w") as file:
                OmegaConf.save(
                    config=launcher.ray_cfg.cluster, f=file.name, resolve=True
                )
            launcher.ray_yaml_path = f.name
            log.info(
                f"Saving RayClusterConf in a temp yaml file: {launcher.ray_yaml_path}."
            )
        # must stay inside the TemporaryDirectory block: local_tmp_dir is
        # deleted as soon as that context exits
        return launch_jobs(launcher, local_tmp_dir, Path(HydraConfig.get().sweep.dir))
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Submit every override set as one submitit job array and wait on results."""
    # lazy import so that plugin discovery stays cheap
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0
    # copy so the launcher's own config is not mutated in place
    queue_parameters = self.queue_parameters.copy()
    OmegaConf.set_struct(queue_parameters, True)
    executor_by_queue = {
        QueueType.auto: submitit.AutoExecutor,
        QueueType.slurm: submitit.SlurmExecutor,
        QueueType.local: submitit.LocalExecutor,
    }
    # keys consumed by the executor constructor, not update_parameters
    init_parameters = {"cluster", "max_num_timeout", "slurm_max_num_timeout"}
    queue_cfg = queue_parameters[self.queue.value]
    executor = executor_by_queue[self.queue](
        folder=self.folder,
        **{k: v for k, v in queue_cfg.items() if k in init_parameters},
    )
    executor.update_parameters(
        **{k: v for k, v in queue_cfg.items() if k not in init_parameters}
    )
    log.info(
        f"Submitit '{self.queue.value}' sweep output dir : "
        f"{self.config.hydra.sweep.dir}"
    )
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # sweep.mode is an octal permission string, e.g. "777"
        os.chmod(sweep_dir, mode=int(str(self.config.hydra.sweep.mode), 8))
    job_params = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        log.info(f"\t#{job_num} : {' '.join(filter_overrides(overrides))}")
        job_params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                job_num,
                f"job_id_for_{job_num}",
                Singleton.get_state(),
            )
        )
    jobs = executor.map_array(self, *zip(*job_params))
    return [j.results()[0] for j in jobs]
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Submit all overrides as a submitit job array and wait for results.

    :param job_overrides: one list of override strings per job.
    :param initial_job_idx: offset added to each job's index within the batch.
    :return: the first result of every submitted job, in submission order.
    """
    # lazy import to ensure plugin discovery remains fast
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0
    # make sure you don't change inplace
    queue_parameters = self.queue_parameters.copy()
    OmegaConf.set_struct(queue_parameters, True)
    if self.queue == "auto":
        # max_num_timeout is a constructor argument, not an executor
        # parameter, so it is removed from the config before update_parameters
        max_num_timeout = self.queue_parameters.auto.max_num_timeout
        with open_dict(queue_parameters):
            del queue_parameters.auto["max_num_timeout"]
        executor = submitit.AutoExecutor(
            folder=self.folder, max_num_timeout=max_num_timeout
        )
    elif self.queue == "slurm":
        max_num_timeout = self.queue_parameters.slurm.max_num_timeout
        with open_dict(queue_parameters):
            del queue_parameters.slurm["max_num_timeout"]
        executor = submitit.SlurmExecutor(
            folder=self.folder, max_num_timeout=max_num_timeout
        )
    elif self.queue == "local":
        executor = submitit.LocalExecutor(folder=self.folder)
    else:
        raise RuntimeError("Unsupported queue type {}".format(self.queue))
    executor.update_parameters(**queue_parameters[self.queue])
    log.info("Sweep output dir : {}".format(self.config.hydra.sweep.dir))
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # sweep.mode is interpreted as an octal permission string, e.g. "777"
        mode = int(str(self.config.hydra.sweep.mode), 8)
        os.chmod(sweep_dir, mode=mode)
    params = []
    for idx, overrides in enumerate(job_overrides):
        # number the job globally within the batch
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        # positional arguments consumed by self.__call__ on the worker side
        params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                idx,
                f"job_id_for_{idx}",
                Singleton.get_state(),
            )
        )
    # one array submission for all jobs; `self` is the picklable callable
    jobs = executor.map_array(self, *zip(*params))
    # j.results() waits for job completion (submitit API)
    return [j.results()[0] for j in jobs]
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """Pickle the sweep jobs and hand them to the Ray AWS cluster for execution."""
    setup_globals()
    assert launcher.config is not None
    assert launcher.hydra_context is not None
    assert launcher.task_function is not None
    setup_commands = launcher.env_setup.commands
    # only install packages that have a pinned version
    pinned_packages = [
        (pkg, ver)
        for pkg, ver in launcher.env_setup.pip_packages.items()
        if ver is not None
    ]
    with read_write(setup_commands):
        setup_commands.extend(
            [f"pip install {package}=={version}" for package, version in pinned_packages]
        )
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)
    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands
    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    logging_config = OmegaConf.to_container(
        launcher.logging, resolve=True, enum_to_str=True
    )
    sdk.configure_logging(**logging_config)
    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")
    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for offset, overrides in enumerate(job_overrides):
            job_num = initial_job_idx + offset
            log.info(f"\t#{job_num} : {' '.join(filter_overrides(overrides))}")
            sweep_config = launcher.hydra_context.config_loader.load_sweep_config(
                launcher.config, list(overrides)
            )
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = job_num
            sweep_configs.append(sweep_config)
        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            hydra_context=launcher.hydra_context,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )
        # launch_jobs must run while local_tmp_dir still exists
        return launch_jobs(
            launcher, local_tmp_dir, Path(launcher.config.hydra.sweep.dir)
        )
def launch(
    launcher: JoblibLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.task_function is not None
    assert launcher.hydra_context is not None

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # Joblib's backend is hard-coded to loky since the threading
    # backend is incompatible with Hydra
    joblib_cfg = launcher.joblib
    joblib_cfg["backend"] = "loky"
    process_joblib_cfg(joblib_cfg)

    log.info(
        "Joblib.Parallel({}) is launching {} jobs".format(
            ",".join([f"{k}={v}" for k, v in joblib_cfg.items()]),
            len(job_overrides),
        )
    )
    log.info("Launching jobs, sweep output dir : {}".format(sweep_dir))
    for idx, overrides in enumerate(job_overrides):
        # BUGFIX: log the batch-global job number (initial_job_idx + idx) so
        # the announced numbers match the hydra.job.num that execute_job
        # assigns below; previously the raw enumerate index was logged.
        log.info(
            "\t#{} : {}".format(
                initial_job_idx + idx, " ".join(filter_overrides(overrides))
            )
        )

    # capture singleton state once; each worker process restores it
    singleton_state = Singleton.get_state()
    runs = Parallel(**joblib_cfg)(
        delayed(execute_job)(
            initial_job_idx + idx,
            overrides,
            launcher.hydra_context,
            launcher.config,
            launcher.task_function,
            singleton_state,
        )
        for idx, overrides in enumerate(job_overrides)
    )

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)
    return runs
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.config_loader is not None
    assert self.task_function is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"Example Launcher(foo={self.foo}, bar={self.bar}) "
        f"is launching {len(job_overrides)} jobs locally"
    )
    log.info(f"Sweep output dir : {sweep_dir}")
    runs = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        log.info(f"\t#{job_num} : {' '.join(filter_overrides(overrides))}")
        sweep_config = self.config_loader.load_sweep_config(
            self.config, list(overrides)
        )
        with open_dict(sweep_config):
            # The real job id normally comes from the underlying scheduler
            # (e.g. SLURM_JOB_ID); fabricate one since we are still in the
            # main process.
            sweep_config.hydra.job.id = f"job_id_for_{job_num}"
            sweep_config.hydra.job.num = job_num
        HydraConfig.instance().set_config(sweep_config)
        runs.append(
            run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
        )
        # reconfigure the logging subsystem for Hydra as the run_job call
        # configured it for the Job; needed because run_job ran in-process
        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    return runs
def launch(
    launcher: RayLocalLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """Start (or connect to) local Ray, run every job remotely, gather results."""
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None
    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"Ray Launcher is launching {len(job_overrides)} jobs, "
        f"sweep output dir: {sweep_dir}"
    )
    start_ray(launcher.ray_init_cfg)
    pending = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        log.info(f"\t#{job_num} : {' '.join(filter_overrides(overrides))}")
        sweep_config = launcher.config_loader.load_sweep_config(
            launcher.config, list(overrides)
        )
        with open_dict(sweep_config):
            # The real job id normally comes from the underlying scheduler
            # (e.g. SLURM_JOB_ID); fabricate one since we are still in the
            # main process.
            sweep_config.hydra.job.id = f"job_id_for_{job_num}"
            sweep_config.hydra.job.num = job_num
        pending.append(
            launch_job_on_ray(
                launcher.ray_remote_cfg,
                sweep_config,
                launcher.task_function,
                Singleton.get_state(),
            )
        )
    # ray.get blocks until each remote job has completed
    return [ray.get(obj) for obj in pending]
def launch(self, job_overrides: Sequence[Sequence[str]]) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.config_loader is not None
    assert self.task_function is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    joblib_args = ",".join([f"{k}={v}" for k, v in self.joblib.items()])
    log.info(
        f"Joblib.Parallel({joblib_args}) is launching {len(job_overrides)} jobs"
    )
    log.info(f"Launching jobs, sweep output dir : {sweep_dir}")
    # capture singleton state once; each worker process restores it
    singleton_state = Singleton.get_state()
    for idx, overrides in enumerate(job_overrides):
        log.info(f"\t#{idx} : {' '.join(filter_overrides(overrides))}")
    runs = Parallel(**self.joblib)(
        delayed(execute_job)(
            idx,
            overrides,
            self.config_loader,
            self.config,
            self.task_function,
            singleton_state,
        )
        for idx, overrides in enumerate(job_overrides)
    )
    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)
    return runs
def launch(
    launcher: RQLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    # BUGFIX: return annotation corrected from JobReturn to
    # Sequence[JobReturn] — the function returns a list, as the docstring
    # and final asserts already state.
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # RQ configuration
    rq_cfg = launcher.rq

    # Redis configuration; a mocked (fake) Redis runs jobs synchronously
    is_async = not rq_cfg.redis.mock
    if is_async:
        connection = Redis(
            host=rq_cfg.redis.host,
            port=rq_cfg.redis.port,
            db=rq_cfg.redis.db,
            password=rq_cfg.redis.password,
        )
    else:
        log.info("Running in synchronous mode")
        connection = FakeStrictRedis()
    queue = Queue(
        name=rq_cfg.queue,
        connection=connection,
        is_async=is_async,
        serializer=cloudpickle,
    )

    # Enqueue jobs
    jobs = []
    singleton_state = Singleton.get_state()
    log.info(
        f"RQ Launcher is enqueuing {len(job_overrides)} job(s) in queue : {rq_cfg.queue}"
    )
    log.info("Sweep output dir : {}".format(sweep_dir))
    if not sweep_dir.is_absolute():
        # BUGFIX: log.warn is a deprecated alias of log.warning
        log.warning(
            "Using relative sweep dir: Please be aware that dir will be relative to where workers are started from."
        )

    for idx, overrides in enumerate(job_overrides):
        description = " ".join(filter_overrides(overrides))
        enqueue_keywords = OmegaConf.to_container(rq_cfg.enqueue, resolve=True)
        # RQ treats -1 as "infinite" for these TTL/timeout settings
        if enqueue_keywords["job_timeout"] is None:
            enqueue_keywords["job_timeout"] = -1
        if enqueue_keywords["result_ttl"] is None:
            enqueue_keywords["result_ttl"] = -1
        if enqueue_keywords["failure_ttl"] is None:
            enqueue_keywords["failure_ttl"] = -1
        if enqueue_keywords["job_id"] is None:
            enqueue_keywords["job_id"] = str(uuid.uuid4())
        if enqueue_keywords["description"] is None:
            enqueue_keywords["description"] = description

        sweep_config = launcher.config_loader.load_sweep_config(
            launcher.config, list(overrides)
        )
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = enqueue_keywords["job_id"]
            sweep_config.hydra.job.num = initial_job_idx + idx

        job = queue.enqueue(
            execute_job,
            sweep_config=sweep_config,
            task_function=launcher.task_function,
            singleton_state=singleton_state,
            **enqueue_keywords,
        )
        jobs.append(job)
        log.info(f"Enqueued {job.get_id()}")
        log.info(f"\t#{idx+1} : {description}")

    log.info("Finished enqueuing")
    if rq_cfg.stop_after_enqueue:
        raise StopAfterEnqueue

    log.info(f"Polling job statuses every {rq_cfg.wait_polling} sec")
    while True:
        job_ids_done = [
            job.get_id()
            for job in jobs
            if job.get_status() in ["finished", "failed"]
        ]
        if len(job_ids_done) == len(jobs):
            break
        else:
            time.sleep(rq_cfg.wait_polling)

    runs = []
    for job in jobs:
        # BUGFIX: `job.result if job.result is not None else None` was a
        # no-op; job.result is None for failed jobs either way
        runs.append(job.result)

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)
    return runs
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """Implementation of Launcher.launch
    Runs jobs in subprocesses, at most ``self._n_jobs`` at a time.
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.config_loader is not None
    assert self.task_function is not None
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    sweep_dir = self.config.hydra.sweep.dir
    Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)
    logger.info(
        f"Local Launcher is launching {len(job_overrides)} jobs locally")
    logger.info(f"Launching jobs, sweep output dir : {sweep_dir}")
    # BUGFIX: removed a redundant pre-loop that logged every override a
    # second time with indices NOT offset by initial_job_idx — its numbers
    # disagreed with the per-job log lines emitted in the loop below.
    results = []
    workers = []
    for i, overrides in enumerate(job_overrides):
        idx = initial_job_idx + i
        lst = " ".join(filter_overrides(overrides))
        logger.info(f"\t#{idx} : {lst}")
        sweep_config = self.config_loader.load_sweep_config(
            self.config, list(overrides))
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = f"job_id_for_{idx}"
            sweep_config.hydra.job.num = idx
        # NOTE(review): Process must be a subclass of multiprocessing.Process
        # exposing exception()/result() — the stdlib class has neither;
        # confirm at the import site.
        p = Process(target=run_job,
                    kwargs=dict(config=sweep_config,
                                task_function=self.task_function,
                                job_dir_key="hydra.sweep.dir",
                                job_subdir_key="hydra.sweep.subdir"))
        p.start()
        workers.append(p)

        # wait for current/last batch of workers
        if ((i + 1) % self._n_jobs == 0) or ((i + 1) == len(job_overrides)):
            for w in workers:
                w.join()
                # forward exceptions from the workers
                if w.exception():
                    raise w.exception()
            # book keeping
            results.extend([p.result() for p in workers])
            workers = []

    assert len(results) == len(job_overrides)
    return results