def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    Submit every override list as one element of a submitit job array.

    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: the first entry of ``results()`` for each submitted job, in submission order.
    """
    # lazy import to ensure plugin discovery remains fast
    import submitit

    assert len(job_overrides) > 0
    launcher_params = self.params

    # Constructor arguments: the folder plus the init-only keys, each
    # prefixed with the executor name.
    executor_only_keys = {"max_num_timeout"}
    ctor_kwargs = {"folder": self.params["submitit_folder"]}
    for key, value in launcher_params.items():
        if key in executor_only_keys:
            ctor_kwargs[f"{self._EXECUTOR}_{key}"] = value
    consumed_keys = executor_only_keys | {"submitit_folder"}
    executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **ctor_kwargs)

    # Everything else goes through update_parameters. Names that are fields
    # of BaseTarget are passed as-is; the rest get the executor prefix.
    shared_names = set(dataclasses.asdict(BaseTarget()).keys())
    update_kwargs = {}
    for key, value in launcher_params.items():
        if key in consumed_keys:
            continue
        target_name = key if key in shared_names else f"{self._EXECUTOR}_{key}"
        update_kwargs[target_name] = value
    executor.update_parameters(**update_kwargs)

    log.info(
        f"Submitit '{self._EXECUTOR}' sweep output dir : "
        f"{self.config.hydra.sweep.dir}"
    )
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # The configured mode is parsed as an octal string.
        os.chmod(sweep_dir, mode=int(str(self.config.hydra.sweep.mode), 8))

    # One row of positional arguments per job; the rows are transposed into
    # per-argument columns for map_array.
    job_rows = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        shown = " ".join(filter_overrides(overrides))
        log.info(f"\t#{job_num} : {shown}")
        job_rows.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                job_num,
                f"job_id_for_{job_num}",
                Singleton.get_state(),
            )
        )

    submitted = executor.map_array(self, *zip(*job_rows))
    return [job.results()[0] for job in submitted]
def hydra_restore_singletons() -> None:
    """
    Snapshot the Singleton state, yield, then put the snapshot back.
    """
    # Deep copy so later changes to the live state cannot leak into the snapshot.
    current_state = Singleton.get_state()
    saved_state = copy.deepcopy(current_state)
    yield
    Singleton.set_state(saved_state)
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    Run every job of the batch one after another in the current process.

    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.hydra_context is not None
    assert self.task_function is not None

    hydra_cfg = self.config.hydra
    configure_log(hydra_cfg.hydra_logging, hydra_cfg.verbose)
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"Example Launcher(foo={self.foo}, bar={self.bar}) is launching {len(job_overrides)} jobs locally"
    )
    log.info(f"Sweep output dir : {sweep_dir}")

    results = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        shown = " ".join(filter_overrides(overrides))
        log.info(f"\t#{job_num} : {shown}")

        job_cfg = self.hydra_context.config_loader.load_sweep_config(
            self.config, list(overrides)
        )
        with open_dict(job_cfg):
            # A real job id usually comes from the underlying scheduler
            # (SLURM_JOB_ID for instance). It is not available here because
            # we are still in the main process; a remote launcher should
            # populate it remotely before calling the task_function.
            job_cfg.hydra.job.id = f"job_id_for_{job_num}"
            job_cfg.hydra.job.num = job_num

        # A launcher that executes code in a different process must restore
        # the singleton state in that process, which usually means
        # serializing the state along with the other parameters passed to
        # the child. Both halves are shown here back to back:
        # this happens on the process executing the launcher ...
        singleton_state = Singleton.get_state()
        # ... and this on the spawned process executing task_function in run_job.
        Singleton.set_state(singleton_state)

        results.append(
            run_job(
                hydra_context=self.hydra_context,
                task_function=self.task_function,
                config=job_cfg,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
        )
        # run_job configured logging for the job, so switch back to Hydra's
        # logging. Needed by launchers that call run_job in the same process
        # instead of spawning a new one.
        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    return results
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    Prepare the cluster setup commands, pickle the sweep jobs into a
    temporary directory, save the cluster config to a YAML file and hand
    everything to launch_jobs.

    :param launcher: the launcher holding the config, env_setup and ray_cfg.
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: whatever launch_jobs returns for this batch.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    # NOTE(review): env_setup.commands is extended in place, so calling
    # launch() again on the same launcher appears to append the same
    # commands a second time - confirm whether that is intended.
    setup_commands = launcher.env_setup.commands
    with read_write(setup_commands):
        setup_commands.extend(
            [
                f"pip install {package}=={version}"
                for package, version in launcher.env_setup.pip_packages.items()
                # A package without a pinned version would otherwise become
                # the invalid requirement "package==None".
                if version is not None
            ]
        )
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)

    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)

    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")

    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            ostr = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {ostr}")
            sweep_config = launcher.config_loader.load_sweep_config(
                launcher.config, list(overrides)
            )
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = idx

            sweep_configs.append(sweep_config)

        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )

        # The temp file only reserves a unique path. It is created with
        # delete=False so it outlives this block, and OmegaConf.save writes
        # to it by name.
        with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
            OmegaConf.save(config=launcher.ray_cfg.cluster, f=f.name, resolve=True)
            launcher.ray_yaml_path = f.name
            log.info(
                f"Saving RayClusterConf in a temp yaml file: {launcher.ray_yaml_path}."
            )

        # Must stay inside the TemporaryDirectory block: launch_jobs is given
        # the directory holding the pickled jobs.
        return launch_jobs(launcher, local_tmp_dir, Path(HydraConfig.get().sweep.dir))
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    Submit the batch as a submitit job array on the configured queue type.

    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: the first entry of ``results()`` for each submitted job, in submission order.
    """
    # lazy import to ensure plugin discovery remains fast
    import submitit

    assert len(job_overrides) > 0

    # make sure you don't change inplace
    queue_parameters = self.queue_parameters.copy()
    OmegaConf.set_struct(queue_parameters, True)

    executors = {
        QueueType.auto: submitit.AutoExecutor,
        QueueType.slurm: submitit.SlurmExecutor,
        QueueType.local: submitit.LocalExecutor,
    }
    # Keys that go to the executor constructor instead of update_parameters.
    init_parameters = {"cluster", "max_num_timeout", "slurm_max_num_timeout"}

    executor_cls = executors[self.queue]
    ctor_kwargs = {}
    for key, value in queue_parameters[self.queue.value].items():
        if key in init_parameters:
            ctor_kwargs[key] = value
    executor = executor_cls(folder=self.folder, **ctor_kwargs)

    update_kwargs = {}
    for key, value in queue_parameters[self.queue.value].items():
        if key not in init_parameters:
            update_kwargs[key] = value
    executor.update_parameters(**update_kwargs)

    queue_name = self.queue.value
    configured_dir = self.config.hydra.sweep.dir
    log.info("Submitit '{}' sweep output dir : {}".format(queue_name, configured_dir))

    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # The configured mode is parsed as an octal string.
        os.chmod(sweep_dir, mode=int(str(self.config.hydra.sweep.mode), 8))

    # One row of positional arguments per job; the rows are transposed into
    # per-argument columns for map_array.
    job_rows = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        shown = " ".join(filter_overrides(overrides))
        log.info(f"\t#{job_num} : {shown}")
        job_rows.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                job_num,
                f"job_id_for_{job_num}",
                Singleton.get_state(),
            )
        )

    submitted = executor.map_array(self, *zip(*job_rows))
    return [job.results()[0] for job in submitted]
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    Submit the batch as a submitit job array on the "auto", "slurm" or
    "local" queue.

    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: the first entry of ``results()`` for each submitted job, in submission order.
    :raises RuntimeError: if ``self.queue`` is not one of the supported queue types.
    """
    # lazy import to ensure plugin discovery remains fast
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0

    # make sure you don't change inplace
    queue_parameters = self.queue_parameters.copy()
    OmegaConf.set_struct(queue_parameters, True)

    # Keys passed to the executor constructor. They are filtered out of the
    # update_parameters call below rather than deleted from the copy: the
    # copy may share its nested nodes with self.queue_parameters, in which
    # case a delete would also strip the key from the launcher's own config.
    init_keys = set()
    if self.queue == "auto":
        init_keys = {"max_num_timeout"}
        executor = submitit.AutoExecutor(
            folder=self.folder,
            max_num_timeout=self.queue_parameters.auto.max_num_timeout,
        )
    elif self.queue == "slurm":
        init_keys = {"max_num_timeout"}
        executor = submitit.SlurmExecutor(
            folder=self.folder,
            max_num_timeout=self.queue_parameters.slurm.max_num_timeout,
        )
    elif self.queue == "local":
        executor = submitit.LocalExecutor(folder=self.folder)
    else:
        raise RuntimeError("Unsupported queue type {}".format(self.queue))

    executor.update_parameters(
        **{
            key: value
            for key, value in queue_parameters[self.queue].items()
            if key not in init_keys
        }
    )

    log.info("Sweep output dir : {}".format(self.config.hydra.sweep.dir))
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        # The configured mode is parsed as an octal string.
        mode = int(str(self.config.hydra.sweep.mode), 8)
        os.chmod(sweep_dir, mode=mode)

    # One tuple of positional arguments per job; zip(*params) transposes
    # them into per-argument columns for map_array.
    params = []
    for idx, overrides in enumerate(job_overrides):
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                idx,
                f"job_id_for_{idx}",
                Singleton.get_state(),
            )
        )

    jobs = executor.map_array(self, *zip(*params))
    return [j.results()[0] for j in jobs]
def restore_singletons() -> Any:
    """
    A fixture that restores the singletons state once the function using
    it has finished.
    Useful for functions that make a one-off change to singletons which
    should not affect other tests.
    """
    # Deep copy so later changes to the live state cannot leak into the snapshot.
    current_state = Singleton.get_state()
    saved_state = copy.deepcopy(current_state)
    yield
    Singleton.set_state(saved_state)
def hydra_restore_singletons() -> None:
    """
    Snapshot the Singleton state and the BaseContainer resolvers, yield,
    then put both snapshots back.
    """
    # Deep copies so later changes to the live objects cannot leak into the snapshots.
    saved_state = copy.deepcopy(Singleton.get_state())
    saved_resolvers = copy.deepcopy(BaseContainer._resolvers)
    yield
    Singleton.set_state(saved_state)
    BaseContainer._resolvers = saved_resolvers
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    Prepare the cluster setup commands, configure logging, pickle the sweep
    jobs into a temporary directory and hand them to launch_jobs.

    :param launcher: the launcher holding the config, env_setup, logging and ray_cfg.
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: whatever launch_jobs returns for this batch.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.hydra_context is not None
    assert launcher.task_function is not None

    # Pip packages with no version (None) are skipped.
    pip_commands = [
        f"pip install {name}=={pinned}"
        for name, pinned in launcher.env_setup.pip_packages.items()
        if pinned is not None
    ]
    setup_commands = launcher.env_setup.commands
    with read_write(setup_commands):
        setup_commands.extend(pip_commands)
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)

    cluster_cfg = launcher.ray_cfg.cluster
    with read_write(cluster_cfg):
        cluster_cfg.setup_commands = setup_commands

    hydra_cfg = launcher.config.hydra
    configure_log(hydra_cfg.hydra_logging, hydra_cfg.verbose)
    sdk_logging_kwargs = OmegaConf.to_container(
        launcher.logging, resolve=True, enum_to_str=True
    )
    sdk.configure_logging(**sdk_logging_kwargs)

    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")

    with tempfile.TemporaryDirectory() as local_tmp_dir:
        prepared_configs = []
        for offset, overrides in enumerate(job_overrides):
            job_num = initial_job_idx + offset
            shown = " ".join(filter_overrides(overrides))
            log.info(f"\t#{job_num} : {shown}")
            job_cfg = launcher.hydra_context.config_loader.load_sweep_config(
                launcher.config, list(overrides)
            )
            with open_dict(job_cfg):
                # job.id will be set on the EC2 instance before running the job.
                job_cfg.hydra.job.num = job_num
            prepared_configs.append(job_cfg)

        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            hydra_context=launcher.hydra_context,
            sweep_configs=prepared_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )

        # Stays inside the TemporaryDirectory block: launch_jobs is given the
        # directory holding the pickled jobs.
        sweep_path = Path(launcher.config.hydra.sweep.dir)
        return launch_jobs(launcher, local_tmp_dir, sweep_path)
def launch(
    launcher: JoblibLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    Run the batch through joblib.Parallel, one execute_job call per job.

    :param launcher: the launcher holding the config, hydra_context, task function and joblib settings.
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.task_function is not None
    assert launcher.hydra_context is not None

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # Joblib's backend is hard-coded to loky since the threading
    # backend is incompatible with Hydra
    joblib_cfg = launcher.joblib
    joblib_cfg["backend"] = "loky"
    process_joblib_cfg(joblib_cfg)

    log.info(
        "Joblib.Parallel({}) is launching {} jobs".format(
            ",".join([f"{k}={v}" for k, v in joblib_cfg.items()]),
            len(job_overrides),
        )
    )
    log.info("Launching jobs, sweep output dir : {}".format(sweep_dir))
    for idx, overrides in enumerate(job_overrides):
        # Log the batch-global job number, i.e. the same index that is
        # passed to execute_job below, not the position inside this batch.
        log.info(
            "\t#{} : {}".format(
                initial_job_idx + idx, " ".join(filter_overrides(overrides))
            )
        )

    # Captured once and handed to every execute_job call.
    singleton_state = Singleton.get_state()

    runs = Parallel(**joblib_cfg)(
        delayed(execute_job)(
            initial_job_idx + idx,
            overrides,
            launcher.hydra_context,
            launcher.config,
            launcher.task_function,
            singleton_state,
        )
        for idx, overrides in enumerate(job_overrides)
    )

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)
    return runs
def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    """
    Submit the jobs one by one through execute_job, pausing between
    submissions, and optionally tail them afterwards.

    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.config_loader is not None
    assert self.task_function is not None

    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"TaskSpooler Launcher is launching {len(job_overrides)} jobs locally"
    )
    log.info(f"Sweep output dir : {sweep_dir}")

    runs = []
    # Captured once and handed to every execute_job call.
    singleton_state = Singleton.get_state()
    for idx, overrides in enumerate(job_overrides):
        # Append the launcher-level overrides to the per-job ones.
        overrides = tuple(list(overrides) + list(self.hydra_overrides))
        ret = execute_job(
            initial_job_idx + idx,
            overrides,
            self.config_loader,
            self.config,
            self.task_function,
            singleton_state,
            self.cmd_prefix,
            self.tsp_prefix,
        )
        runs.append(ret)
        time.sleep(self.time_between_submit)

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)

    # Parallel rejects n_jobs=0, so skip tailing when nothing was submitted.
    if self.tail_jobs and runs:
        Parallel(n_jobs=len(job_overrides), backend='threading')(
            delayed(self.tail_job)(run.return_value) for run in runs
        )

    return runs
def launch(
    launcher: RayLocalLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    Start Ray, launch every job of the batch through launch_job_on_ray and
    collect the results with ray.get.

    :param launcher: the launcher holding the config, config_loader, task function and Ray settings.
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: the ray.get result of each launched job, in launch order.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    hydra_cfg = launcher.config.hydra
    configure_log(hydra_cfg.hydra_logging, hydra_cfg.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"Ray Launcher is launching {len(job_overrides)} jobs, "
        f"sweep output dir: {sweep_dir}"
    )

    start_ray(launcher.ray_init_cfg)

    pending = []
    for offset, overrides in enumerate(job_overrides):
        job_num = initial_job_idx + offset
        shown = " ".join(filter_overrides(overrides))
        log.info(f"\t#{job_num} : {shown}")
        job_cfg = launcher.config_loader.load_sweep_config(
            launcher.config, list(overrides)
        )
        with open_dict(job_cfg):
            # A real job id usually comes from the underlying scheduler
            # (SLURM_JOB_ID for instance). It is not available here because
            # we are still in the main process; it should instead be
            # populated remotely before calling the task_function.
            job_cfg.hydra.job.id = f"job_id_for_{job_num}"
            job_cfg.hydra.job.num = job_num
        pending.append(
            launch_job_on_ray(
                launcher.ray_remote_cfg,
                job_cfg,
                launcher.task_function,
                Singleton.get_state(),
            )
        )

    # All jobs are launched before the first ray.get call.
    return [ray.get(handle) for handle in pending]
def launch(self, job_overrides: Sequence[Sequence[str]]) -> Sequence[JobReturn]:
    """
    Run the batch through joblib.Parallel, one execute_job call per job.

    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert self.config is not None
    assert self.config_loader is not None
    assert self.task_function is not None

    hydra_cfg = self.config.hydra
    configure_log(hydra_cfg.hydra_logging, hydra_cfg.verbose)
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    joblib_summary = ",".join([f"{k}={v}" for k, v in self.joblib.items()])
    log.info(
        "Joblib.Parallel({}) is launching {} jobs".format(
            joblib_summary, len(job_overrides)
        )
    )
    log.info("Launching jobs, sweep output dir : {}".format(sweep_dir))

    # Captured once and handed to every execute_job call.
    singleton_state = Singleton.get_state()

    for position, overrides in enumerate(job_overrides):
        shown = " ".join(filter_overrides(overrides))
        log.info("\t#{} : {}".format(position, shown))

    worker_pool = Parallel(**self.joblib)
    runs = worker_pool(
        delayed(execute_job)(
            position,
            overrides,
            self.config_loader,
            self.config,
            self.task_function,
            singleton_state,
        )
        for position, overrides in enumerate(job_overrides)
    )

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)
    return runs
def launch(
    launcher: RQLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    Enqueue every job of the batch on an RQ queue, then poll until all of
    them are finished or failed.

    :param launcher: the launcher holding the config, config_loader, task function and RQ settings.
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    :raises StopAfterEnqueue: when ``rq.stop_after_enqueue`` is set, right after enqueuing.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # RQ configuration
    rq_cfg = launcher.rq

    # Redis configuration
    is_async = not rq_cfg.redis.mock
    if is_async:
        connection = Redis(
            host=rq_cfg.redis.host,
            port=rq_cfg.redis.port,
            db=rq_cfg.redis.db,
            password=rq_cfg.redis.password,
        )
    else:
        log.info("Running in synchronous mode")
        connection = FakeStrictRedis()
    queue = Queue(
        name=rq_cfg.queue,
        connection=connection,
        is_async=is_async,
        serializer=cloudpickle,
    )

    # Enqueue jobs
    jobs = []
    # Captured once and handed to every enqueued job.
    singleton_state = Singleton.get_state()
    log.info(
        f"RQ Launcher is enqueuing {len(job_overrides)} job(s) in queue : {rq_cfg.queue}"
    )
    log.info("Sweep output dir : {}".format(sweep_dir))
    if not sweep_dir.is_absolute():
        log.warning(
            "Using relative sweep dir: Please be aware that dir will be relative to where workers are started from."
        )

    for idx, overrides in enumerate(job_overrides):
        description = " ".join(filter_overrides(overrides))

        # Rebuilt for every job because the defaults filled in below
        # (notably the generated job_id) must not carry over to the next job.
        enqueue_keywords = OmegaConf.to_container(rq_cfg.enqueue, resolve=True)
        # Unset timeouts/TTLs are passed on as -1.
        for ttl_key in ("job_timeout", "result_ttl", "failure_ttl"):
            if enqueue_keywords[ttl_key] is None:
                enqueue_keywords[ttl_key] = -1
        if enqueue_keywords["job_id"] is None:
            enqueue_keywords["job_id"] = str(uuid.uuid4())
        if enqueue_keywords["description"] is None:
            enqueue_keywords["description"] = description

        sweep_config = launcher.config_loader.load_sweep_config(
            launcher.config, list(overrides)
        )
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = enqueue_keywords["job_id"]
            sweep_config.hydra.job.num = initial_job_idx + idx

        job = queue.enqueue(
            execute_job,
            sweep_config=sweep_config,
            task_function=launcher.task_function,
            singleton_state=singleton_state,
            **enqueue_keywords,
        )
        jobs.append(job)

        log.info(f"Enqueued {job.get_id()}")
        log.info(f"\t#{idx+1} : {description}")

    log.info("Finished enqueuing")
    if rq_cfg.stop_after_enqueue:
        raise StopAfterEnqueue

    log.info(f"Polling job statuses every {rq_cfg.wait_polling} sec")
    # Wait until every job reports a terminal status.
    while not all(job.get_status() in ["finished", "failed"] for job in jobs):
        time.sleep(rq_cfg.wait_polling)

    # NOTE(review): a job whose result is None trips the isinstance
    # assertion below - presumably this is the case for failed jobs; confirm.
    runs = [job.result for job in jobs]

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)

    return runs
def test_singleton_get_state(hydra_restore_singletons: Any) -> None:
    """
    The state returned by Singleton.get_state() has no Plugins entry under
    "instances", while Plugins is present in Singleton._instances both
    before and after that state is set back.
    """
    snapshot = Singleton.get_state()
    assert Plugins not in snapshot["instances"]
    assert Plugins in Singleton._instances

    # Setting the snapshot back must leave Plugins in place.
    Singleton.set_state(snapshot)
    assert Plugins in Singleton._instances
from hydra._internal.core_plugins.file_config_source import FileConfigSource from hydra._internal.core_plugins.importlib_resources_config_source import ( ImportlibResourcesConfigSource, ) from hydra._internal.core_plugins.structured_config_source import StructuredConfigSource from hydra.core.default_element import GroupDefault, InputDefault from hydra.core.plugins import Plugins from hydra.core.singleton import Singleton from hydra.plugins.config_source import ConfigSource from hydra.test_utils.config_source_common_tests import ConfigSourceTestSuite from hydra.test_utils.test_utils import chdir_hydra_root chdir_hydra_root() # Manually save and restore singletons to work around an issue with things added to the config store via importing. # restoring is done in test_restore_singleton_state_hack(), which must be the last test in this file. state = copy.deepcopy(Singleton.get_state()) @pytest.mark.parametrize( "type_, path", [ pytest.param( FileConfigSource, "file://tests/test_apps/config_source_test/dir", id="FileConfigSource", ), pytest.param( ImportlibResourcesConfigSource, "pkg://tests.test_apps.config_source_test.dir", id="ImportlibResourcesConfigSource", ),