Example #1
    def __call__(
        self,
        sweep_overrides: List[str],
        job_dir_key: str,
        job_num: int,
        job_id: str,
        singleton_state: Dict[type, "Singleton"],
    ):
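        # Restore the Hydra singletons captured by the parent process before loading any config.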
        Singleton.set_state(singleton_state)
        configure_log(self.config.hydra.job_logging, self.config.hydra.verbose)
        setup_globals()
        sweep_config = self.config_loader.load_sweep_config(
            self.config, sweep_overrides)
        with open_dict(sweep_config.hydra.job) as job:
            # Populate new job variables
            if "SLURM_JOB_ID" in os.environ:
                job.id = os.environ["SLURM_JOB_ID"]
            else:
                job.id = job_id
            sweep_config.hydra.job.num = job_num

        return run_job(
            config=sweep_config,
            task_function=self.task_function,
            job_dir_key=job_dir_key,
            job_subdir_key="hydra.sweep.subdir",
        )
Example #2
def launch_jobs(temp_dir: str) -> None:
    runs = []
    with open(os.path.join(temp_dir, JOB_SPEC_PICKLE), "rb") as f:
        job_spec = pickle.load(f)  # nosec
        singleton_state = job_spec["singleton_state"]
        sweep_configs = job_spec["sweep_configs"]
        task_function = job_spec["task_function"]

        instance_id = _get_instance_id()

        sweep_dir = None

        for sweep_config in sweep_configs:
            with open_dict(sweep_config):
                sweep_config.hydra.job.id = (
                    f"{instance_id}_{sweep_config.hydra.job.num}"
                )
            setup_globals()
            Singleton.set_state(singleton_state)
            HydraConfig.instance().set_config(sweep_config)
            ray_init_cfg = sweep_config.hydra.launcher.ray_init_cfg
            ray_remote_cfg = sweep_config.hydra.launcher.ray_remote_cfg

            if not sweep_dir:
                sweep_dir = Path(str(HydraConfig.get().sweep.dir))
                sweep_dir.mkdir(parents=True, exist_ok=True)

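            # Connect to an existing Ray cluster, or start a local one, per the launcher config.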
            start_ray(ray_init_cfg)
            ray_obj = launch_job_on_ray(
                ray_remote_cfg, sweep_config, task_function, singleton_state
            )
            runs.append(ray_obj)

    result = [ray.get(run) for run in runs]
    _dump_job_return(result, temp_dir)
Example #3
    def __call__(
        self,
        sweep_overrides: List[str],
        job_dir_key: str,
        job_num: int,
        job_id: str,
        singleton_state: Dict[type, Singleton],
    ) -> JobReturn:
        # lazy import to ensure plugin discovery remains fast
        import submitit

        assert self.config_loader is not None
        assert self.config is not None
        assert self.task_function is not None

        Singleton.set_state(singleton_state)
        setup_globals()
        sweep_config = self.config_loader.load_sweep_config(
            self.config, sweep_overrides)

        with open_dict(sweep_config.hydra.job) as job:
            # Populate new job variables
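            # submitit exposes the scheduler-assigned job id inside the worker.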
            job.id = submitit.JobEnvironment().job_id  # type: ignore
            sweep_config.hydra.job.num = job_num

        return run_job(
            config=sweep_config,
            task_function=self.task_function,
            job_dir_key=job_dir_key,
            job_subdir_key="hydra.sweep.subdir",
        )
Example #4
    def launch(self, job_overrides: Sequence[Sequence[str]],
               initial_job_idx: int) -> Sequence[JobReturn]:
        setup_globals()
        assert self.config is not None
        assert self.task_function is not None
        assert self.config_loader is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = self.config.hydra.sweep.dir
        Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)
        log.info(f"Launching {len(job_overrides)} jobs locally")
        runs: List[JobReturn] = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            lst = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {lst}")
            sweep_config = self.config_loader.load_sweep_config(
                self.config, list(overrides))
            with open_dict(sweep_config):
                sweep_config.hydra.job.id = idx
                sweep_config.hydra.job.num = idx
            ret = run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
            runs.append(ret)
            configure_log(self.config.hydra.hydra_logging,
                          self.config.hydra.verbose)
        return runs
Example #5
def execute_job(
    idx: int,
    overrides: Sequence[str],
    config_loader: ConfigLoader,
    config: DictConfig,
    task_function: TaskFunction,
    singleton_state: Dict[Any, Any],
) -> JobReturn:
    """Calls `run_job` in parallel
    """
    setup_globals()
    Singleton.set_state(singleton_state)

    sweep_config = config_loader.load_sweep_config(config, list(overrides))
    with open_dict(sweep_config):
        sweep_config.hydra.job.id = "{}_{}".format(sweep_config.hydra.job.name,
                                                   idx)
        sweep_config.hydra.job.num = idx
    HydraConfig.instance().set_config(sweep_config)

    ret = run_job(
        config=sweep_config,
        task_function=task_function,
        job_dir_key="hydra.sweep.dir",
        job_subdir_key="hydra.sweep.subdir",
    )

    return ret
Example #6
    def test_sweep_config_cache(self, hydra_restore_singletons: Any, path: str,
                                monkeypatch: Any) -> None:
        setup_globals()

        config_loader = ConfigLoaderImpl(
            config_search_path=create_config_search_path(path))
        master_cfg = config_loader.load_configuration(
            config_name="config.yaml",
            strict=False,
            overrides=["time='${now:%H-%M-%S}'", "home='${env:HOME}'"],
            run_mode=RunMode.RUN,
        )

        # trigger resolution by type assertion
        assert type(master_cfg.time) == str
        assert type(master_cfg.home) == str

        master_cfg_cache = OmegaConf.get_cache(master_cfg)
        assert "now" in master_cfg_cache.keys(
        ) and "env" in master_cfg_cache.keys()
        assert master_cfg.home == os.environ["HOME"]

        sweep_cfg = config_loader.load_sweep_config(
            master_config=master_cfg,
            sweep_overrides=["time='${now:%H-%M-%S}'", "home='${env:HOME}'"],
        )

        sweep_cfg_cache = OmegaConf.get_cache(sweep_cfg)
        assert len(sweep_cfg_cache.keys()) == 1 and "now" in sweep_cfg_cache.keys()
        assert sweep_cfg_cache["now"] == master_cfg_cache["now"]
        monkeypatch.setenv("HOME", "/another/home/dir/")
        assert sweep_cfg.home == os.getenv("HOME")
Example #7
    def launch(self,
               job_overrides: Sequence[Sequence[str]]) -> Sequence[JobReturn]:
        setup_globals()
        assert self.config is not None
        assert self.task_function is not None
        assert self.config_loader is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = self.config.hydra.sweep.dir
        Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)
        log.info("Launching {} jobs locally".format(len(job_overrides)))
        runs: List[JobReturn] = []

        for idx, overrides in enumerate(job_overrides):
            log.info("\t#{} : {}".format(idx, " ".join(
                filter_overrides(overrides))))
            sweep_config = self.config_loader.load_sweep_config(
                self.config, list(overrides))
            with open_dict(sweep_config):
                sweep_config.hydra.job.id = idx
                sweep_config.hydra.job.num = idx
            HydraConfig.instance().set_config(sweep_config)
            ret = run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
            runs.append(ret)
            configure_log(self.config.hydra.hydra_logging,
                          self.config.hydra.verbose)
        return runs
Example #8
def dispatch_job(
    idx: int,
    overrides: Sequence[str],
    config_loader: ConfigLoader,
    config: DictConfig,
    task_function: TaskFunction,
    singleton_state: Dict[Any, Any],
) -> JobReturn:
    """Calls `run_job` in parallel

    Note that Joblib's default backend runs isolated Python processes, see
    https://joblib.readthedocs.io/en/latest/parallel.html#shared-memory-semantics
    """
    setup_globals()
    Singleton.set_state(singleton_state)

    log.info("\t#{} : {}".format(idx, " ".join(filter_overrides(overrides))))
    sweep_config = config_loader.load_sweep_config(config, list(overrides))
    with open_dict(sweep_config):
        sweep_config.hydra.job.id = "{}_{}".format(sweep_config.hydra.job.name,
                                                   idx)
        sweep_config.hydra.job.num = idx
    HydraConfig.instance().set_config(sweep_config)

    ret = run_job(
        config=sweep_config,
        task_function=task_function,
        job_dir_key="hydra.sweep.dir",
        job_subdir_key="hydra.sweep.subdir",
    )

    return ret
Example #9
    def launch(
        self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
    ) -> Sequence[JobReturn]:
        setup_globals()
        assert self.hydra_context is not None
        assert self.config is not None
        assert self.task_function is not None

        configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
        sweep_dir = self.config.hydra.sweep.dir
        Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)
        log.info("Launching {} jobs on slurm".format(len(job_overrides)))
        runs: List[JobReturn] = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            lst = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {lst}")
            sweep_config = self.hydra_context.config_loader.load_sweep_config(
                self.config, list(overrides)
            )

            with open_dict(sweep_config):
                sweep_config.hydra.job.id = idx
                sweep_config.hydra.job.num = idx
            HydraConfig.instance().set_config(sweep_config)
            log.info("\tJob name : {}".format(slurm_utils.resolve_name(sweep_config.slurm.job_name)))

            slurm_utils.write_slurm(sweep_config)
            slurm_utils.write_sh(sweep_config, " ".join(filter_overrides(overrides)))
            slurm_utils.launch_job(sweep_config)

            configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
            if sweep_config.wait:
                time.sleep(1)
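        # Note: jobs are submitted to SLURM fire-and-forget;
        # no JobReturn objects are collected, so `runs` is returned empty.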
        return runs
Example #10
    def launch(self, job_overrides: Sequence[Sequence[str]],
               initial_job_idx: int) -> Sequence[JobReturn]:
        """
        :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
        :param initial_job_idx: Initial job idx in batch.
        :return: an array of return values from run_job with indexes corresponding to the input list indexes.
        """
        setup_globals()
        assert self.config is not None
        assert self.hydra_context is not None
        assert self.task_function is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        log.info(
            f"Example Launcher(foo={self.foo}, bar={self.bar}) is launching {len(job_overrides)} jobs locally"
        )
        log.info(f"Sweep output dir : {sweep_dir}")
        runs = []

        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            lst = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {lst}")
            sweep_config = self.hydra_context.config_loader.load_sweep_config(
                self.config, list(overrides))
            with open_dict(sweep_config):
                # The job id typically comes from the underlying scheduler (e.g. SLURM_JOB_ID).
                # In that case it will not be available here, because we are still in the main
                # process; it should instead be populated remotely, before calling the task_function.
                sweep_config.hydra.job.id = f"job_id_for_{idx}"
                sweep_config.hydra.job.num = idx

            # If your launcher is executing code in a different process, it is important to restore
            # the singleton state in the new process.
            # To do this, you will likely need to serialize the singleton state along with the other
            # parameters passed to the child process.

            # happening on this process (executing launcher)
            state = Singleton.get_state()

            # happening on the spawned process (executing task_function in run_job)
            Singleton.set_state(state)

            ret = run_job(
                hydra_context=self.hydra_context,
                task_function=self.task_function,
                config=sweep_config,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
            runs.append(ret)
            # Reconfigure the logging subsystem for Hydra, as the run_job call configured it for the job.
            # This is needed for launchers that call run_job in the same process and do not spawn a new one.
            configure_log(self.config.hydra.hydra_logging,
                          self.config.hydra.verbose)
        return runs
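The comments in Example #10 spell out the singleton round-trip between the launcher and a spawned process. Below is a minimal sketch of that hand-off, assuming only hydra.core.singleton.Singleton; the child_entry and payload names are illustrative, not part of Hydra's API:

import multiprocessing as mp

from hydra.core.singleton import Singleton


def child_entry(payload: dict) -> None:
    # Runs in the spawned process: restore the singletons captured by the
    # parent before touching any Hydra machinery (config loader, plugins).
    Singleton.set_state(payload["singleton_state"])
    # ... load the sweep config and call run_job here ...


if __name__ == "__main__":
    # Runs in the launcher process: capture the state and ship it to the child.
    payload = {"singleton_state": Singleton.get_state()}
    p = mp.Process(target=child_entry, args=(payload,))
    p.start()
    p.join()

On start methods that spawn rather than fork, this explicit transfer is what keeps the child's Hydra state consistent with the parent's.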
Example #11
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    setup_commands = launcher.env_setup.commands
    with read_write(setup_commands):
        setup_commands.extend([
            f"pip install {package}=={version}"
            for package, version in launcher.env_setup.pip_packages.items()
        ])
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)

    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands

    configure_log(launcher.config.hydra.hydra_logging,
                  launcher.config.hydra.verbose)

    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")

    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            ostr = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {ostr}")
            sweep_config = launcher.config_loader.load_sweep_config(
                launcher.config, list(overrides))
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = idx

            sweep_configs.append(sweep_config)

        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )

        with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
            with open(f.name, "w") as file:
                OmegaConf.save(config=launcher.ray_cfg.cluster,
                               f=file.name,
                               resolve=True)
            launcher.ray_yaml_path = f.name
            log.info(
                f"Saving RayClusterConf in a temp yaml file: {launcher.ray_yaml_path}."
            )

            return launch_jobs(launcher, local_tmp_dir,
                               Path(HydraConfig.get().sweep.dir))
Example #12
def test_py_version_resolver(hydra_restore_singletons: Any,
                             monkeypatch: Any) -> Any:
    monkeypatch.setattr(sys, "version_info", (3, 7, 2))
    utils.setup_globals()
    assert OmegaConf.create({"key": "${python_version:}"}).key == "3.7"
    assert OmegaConf.create({"key": "${python_version:major}"}).key == "3"
    assert OmegaConf.create({"key": "${python_version:minor}"}).key == "3.7"
    assert OmegaConf.create({"key": "${python_version:micro}"}).key == "3.7.2"
Example #13
    def __init__(self, task_name: str, config_loader: ConfigLoader) -> None:
        """
        :param task_name: task name
        :param config_loader: config loader
        """
        setup_globals()
        self.config_loader = config_loader
        JobRuntime().set("name", task_name)
Example #14
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    setup_globals()
    assert launcher.config is not None
    assert launcher.hydra_context is not None
    assert launcher.task_function is not None

    setup_commands = launcher.env_setup.commands
    packages = filter(
        lambda x: x[1] is not None, launcher.env_setup.pip_packages.items()
    )
    with read_write(setup_commands):
        setup_commands.extend(
            [f"pip install {package}=={version}" for package, version in packages]
        )
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)

    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    logging_config = OmegaConf.to_container(
        launcher.logging, resolve=True, enum_to_str=True
    )
    sdk.configure_logging(**logging_config)

    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")

    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            ostr = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {ostr}")
            sweep_config = launcher.hydra_context.config_loader.load_sweep_config(
                launcher.config, list(overrides)
            )
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = idx

            sweep_configs.append(sweep_config)

        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            hydra_context=launcher.hydra_context,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )
        return launch_jobs(
            launcher, local_tmp_dir, Path(launcher.config.hydra.sweep.dir)
        )
Example #15
def launch(
    launcher: JoblibLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.task_function is not None
    assert launcher.hydra_context is not None

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # Joblib's backend is hard-coded to loky since the threading
    # backend is incompatible with Hydra
    joblib_cfg = launcher.joblib
    joblib_cfg["backend"] = "loky"
    process_joblib_cfg(joblib_cfg)

    log.info(
        "Joblib.Parallel({}) is launching {} jobs".format(
            ",".join([f"{k}={v}" for k, v in joblib_cfg.items()]),
            len(job_overrides),
        )
    )
    log.info("Launching jobs, sweep output dir : {}".format(sweep_dir))
    for idx, overrides in enumerate(job_overrides):
        log.info("\t#{} : {}".format(idx, " ".join(filter_overrides(overrides))))

    singleton_state = Singleton.get_state()

    runs = Parallel(**joblib_cfg)(
        delayed(execute_job)(
            initial_job_idx + idx,
            overrides,
            launcher.hydra_context,
            launcher.config,
            launcher.task_function,
            singleton_state,
        )
        for idx, overrides in enumerate(job_overrides)
    )

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)
    return runs
Example #16
def test_foo(restore_singletons: Any) -> Any:
    utils.setup_globals()

    config_loader = ConfigLoaderImpl(
        config_search_path=create_config_search_path(
            "pkg://hydra.test_utils.configs"))
    cfg = config_loader.load_configuration(
        config_name="accessing_hydra_config", overrides=[])
    HydraConfig.instance().set_config(cfg)
    with open_dict(cfg):
        del cfg["hydra"]
    assert cfg.job_name == "UNKNOWN_NAME"
    assert cfg.config_name == "accessing_hydra_config"
Example #17
def _run_job(
    sweep_config: DictConfig,
    task_function: TaskFunction,
    singleton_state: Dict[Any, Any],
) -> JobReturn:
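    # Worker-side helper: rebuild global state and singletons before delegating to run_job.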
    setup_globals()
    Singleton.set_state(singleton_state)
    HydraConfig.instance().set_config(sweep_config)
    return run_job(
        config=sweep_config,
        task_function=task_function,
        job_dir_key="hydra.sweep.dir",
        job_subdir_key="hydra.sweep.subdir",
    )
Example #18
    def launch(self, job_overrides: Sequence[Sequence[str]],
               initial_job_idx: int) -> Sequence[JobReturn]:
        """
        :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
        :param initial_job_idx: Initial job idx in batch.
        :return: an array of return values from run_job with indexes corresponding to the input list indexes.
        """
        setup_globals()
        assert self.config is not None
        assert self.config_loader is not None
        assert self.task_function is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        log.info(
            f"TaskSpooler Launcher is launching {len(job_overrides)} jobs locally"
        )
        log.info(f"Sweep output dir : {sweep_dir}")
        runs = []
        singleton_state = Singleton.get_state()

        for idx, overrides in enumerate(job_overrides):
            overrides = list(overrides)
            overrides.extend(self.hydra_overrides)
            overrides = tuple(overrides)

            ret = execute_job(
                initial_job_idx + idx,
                overrides,
                self.config_loader,
                self.config,
                self.task_function,
                singleton_state,
                self.cmd_prefix,
                self.tsp_prefix,
            )
            runs.append(ret)
            time.sleep(self.time_between_submit)

        assert isinstance(runs, List)
        for run in runs:
            assert isinstance(run, JobReturn)

        if self.tail_jobs:
            Parallel(n_jobs=len(job_overrides), backend='threading')(
                delayed(self.tail_job)(run.return_value) for run in runs)
        return runs
Example #19
    def launch(self, job_overrides: Sequence[Sequence[str]],
               initial_job_idx: int) -> Sequence[JobReturn]:
        """
        :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
        :param initial_job_idx: Initial job idx in batch.
        :return: an array of return values from run_job with indexes corresponding to the input list indexes.
        """
        setup_globals()
        assert self.config is not None
        assert self.config_loader is not None
        assert self.task_function is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        log.info(
            "Example Launcher(foo={}, bar={}) is launching {} jobs locally".
            format(self.foo, self.bar, len(job_overrides)))
        log.info("Sweep output dir : {}".format(sweep_dir))
        runs = []

        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            log.info("\t#{} : {}".format(idx, " ".join(
                filter_overrides(overrides))))
            sweep_config = self.config_loader.load_sweep_config(
                self.config, list(overrides))
            with open_dict(sweep_config):
                # The job id typically comes from the underlying scheduler (e.g. SLURM_JOB_ID).
                # In that case it will not be available here, because we are still in the main
                # process; it should instead be populated remotely, before calling the task_function.
                sweep_config.hydra.job.id = "job_id_for_{}".format(idx)
                sweep_config.hydra.job.num = idx
            HydraConfig.instance().set_config(sweep_config)

            ret = run_job(
                config=sweep_config,
                task_function=self.task_function,
                job_dir_key="hydra.sweep.dir",
                job_subdir_key="hydra.sweep.subdir",
            )
            runs.append(ret)
            # Reconfigure the logging subsystem for Hydra, as the run_job call configured it for the job.
            # This is needed for launchers that call run_job in the same process and do not spawn a new one.
            configure_log(self.config.hydra.hydra_logging,
                          self.config.hydra.verbose)
        return runs
Example #20
    def __init__(self, task_name: str, config_loader: ConfigLoader) -> None:
        """
        :param task_name: task name
        :param config_loader: config loader
        """
        setup_globals()
        self.config_loader = config_loader

        for source in config_loader.get_sources():
            # if specified, make sure main config search path exists
            if source.provider == "main":
                if not source.exists(""):
                    raise MissingConfigException(
                        missing_cfg_file=source.path,
                        message=f"Primary config dir not found: {source}",
                    )

        JobRuntime().set("name", task_name)
Example #21
def execute_job(
    idx: int,
    overrides: Sequence[str],
    config_loader: ConfigLoader,
    config: DictConfig,
    task_function: TaskFunction,
    singleton_state: Dict[Any, Any],
    cmd_prefix: str,
    tsp_prefix: str,
) -> JobReturn:
    """Calls `run_job` in parallel
    """
    setup_globals()
    Singleton.set_state(singleton_state)

    lst = " ".join(overrides)

    sweep_config = config_loader.load_sweep_config(config, list(overrides))
    with open_dict(sweep_config):
        sweep_config.hydra.job.id = "{}_{}".format(sweep_config.hydra.job.name,
                                                   idx)
        sweep_config.hydra.job.num = idx
    HydraConfig.instance().set_config(sweep_config)

    def tsp_task_function(task_cfg):
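        # Submits the job to Task Spooler via the shell and returns the tsp queue id.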
        working_dir = os.getcwd()
        cmd = f"{cmd_prefix} {lst}"
        log.info(f"\t#{idx} : {lst}")
        cmd = f"cd {hydra.utils.get_original_cwd()} && {cmd} hydra.run.dir={working_dir}"
        job_id = int(subprocess.check_output(cmd, shell=True).rstrip())
        log.info(
            f"Submitted {idx} to TaskSpooler. View logs: {tsp_prefix} -t {job_id}"
        )
        return job_id

    ret = run_job(
        config=sweep_config,
        task_function=tsp_task_function,
        job_dir_key="hydra.sweep.dir",
        job_subdir_key="hydra.sweep.subdir",
    )
    ret.id = ret.return_value

    return ret
Example #22
def launch(
    launcher: RayLocalLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    log.info(
        f"Ray Launcher is launching {len(job_overrides)} jobs, "
        f"sweep output dir: {sweep_dir}"
    )

    start_ray(launcher.ray_init_cfg)

    runs = []
    for idx, overrides in enumerate(job_overrides):
        idx = initial_job_idx + idx
        ostr = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {ostr}")
        sweep_config = launcher.config_loader.load_sweep_config(
            launcher.config, list(overrides)
        )
        with open_dict(sweep_config):
            # The job id typically comes from the underlying scheduler (e.g. SLURM_JOB_ID).
            # In that case it will not be available here, because we are still in the main
            # process; it should instead be populated remotely, before calling the task_function.
            sweep_config.hydra.job.id = f"job_id_for_{idx}"
            sweep_config.hydra.job.num = idx
        ray_obj = launch_job_on_ray(
            launcher.ray_remote_cfg,
            sweep_config,
            launcher.task_function,
            Singleton.get_state(),
        )
        runs.append(ray_obj)

    return [ray.get(run) for run in runs]
Example #23
    def launch(self,
               job_overrides: Sequence[Sequence[str]]) -> Sequence[JobReturn]:
        """
        :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
        :return: an array of return values from run_job with indexes corresponding to the input list indexes.
        """
        setup_globals()
        assert self.config is not None
        assert self.config_loader is not None
        assert self.task_function is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        log.info("Joblib.Parallel({}) is launching {} jobs".format(
            ",".join([f"{k}={v}" for k, v in self.joblib.items()]),
            len(job_overrides),
        ))
        log.info("Launching jobs, sweep output dir : {}".format(sweep_dir))

        singleton_state = Singleton.get_state()

        for idx, overrides in enumerate(job_overrides):
            log.info("\t#{} : {}".format(idx, " ".join(
                filter_overrides(overrides))))

        runs = Parallel(**self.joblib)(delayed(execute_job)(
            idx,
            overrides,
            self.config_loader,
            self.config,
            self.task_function,
            singleton_state,
        ) for idx, overrides in enumerate(job_overrides))

        assert isinstance(runs, List)
        for run in runs:
            assert isinstance(run, JobReturn)
        return runs
Example #24
    def test_sweep_config_cache(self, hydra_restore_singletons: Any, path: str,
                                monkeypatch: Any) -> None:
        setup_globals()

        monkeypatch.setenv("TEST_ENV", "test_env")

        config_loader = ConfigLoaderImpl(
            config_search_path=create_config_search_path(path))
        master_cfg = config_loader.load_configuration(
            config_name="config.yaml",
            overrides=[
                "+time=${now:%H-%M-%S}", "+test_env=${oc.env:TEST_ENV}"
            ],
            run_mode=RunMode.RUN,
        )

        # trigger resolution by type assertion
        assert type(master_cfg.time) == str
        assert type(master_cfg.test_env) == str

        master_cfg_cache = OmegaConf.get_cache(master_cfg)
        assert "now" in master_cfg_cache.keys()
        # oc.env is not cached as of OmegaConf 2.1
        assert "oc.env" not in master_cfg_cache.keys()
        assert master_cfg.test_env == "test_env"

        sweep_cfg = config_loader.load_sweep_config(
            master_config=master_cfg,
            sweep_overrides=[
                "+time=${now:%H-%M-%S}", "+test_env=${oc.env:TEST_ENV}"
            ],
        )

        sweep_cfg_cache = OmegaConf.get_cache(sweep_cfg)
        assert len(sweep_cfg_cache.keys()) == 1 and "now" in sweep_cfg_cache.keys()
        assert sweep_cfg_cache["now"] == master_cfg_cache["now"]
        monkeypatch.setenv("TEST_ENV", "test_env2")
        assert sweep_cfg.test_env == "test_env2"
Example #25
    def launch(self, job_overrides: Sequence[Sequence[str]],
               initial_job_idx: int) -> Sequence[JobReturn]:
        """Implementation of Launcher.launch

        :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
        :param initial_job_idx: Initial job idx in batch.
        :return: an array of return values from run_job with indexes corresponding to the input list indexes.
        """
        setup_globals()
        assert self.config is not None
        assert self.config_loader is not None
        assert self.task_function is not None

        configure_log(self.config.hydra.hydra_logging,
                      self.config.hydra.verbose)
        sweep_dir = self.config.hydra.sweep.dir
        Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)

        logger.info(
            f"Local Launcher is launching {len(job_overrides)} jobs locally")
        logger.info(f"Launching jobs, sweep output dir : {sweep_dir}")
        for idx, overrides in enumerate(job_overrides):
            logger.info("\t#{} : {}".format(
                idx, " ".join(filter_overrides(overrides))))

        results = []
        workers = []
        for i, overrides in enumerate(job_overrides):
            idx = initial_job_idx + i
            lst = " ".join(filter_overrides(overrides))
            logger.info(f"\t#{idx} : {lst}")

            sweep_config = self.config_loader.load_sweep_config(
                self.config, list(overrides))
            with open_dict(sweep_config):
                sweep_config.hydra.job.id = f"job_id_for_{idx}"
                sweep_config.hydra.job.num = idx

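            # NOTE: this assumes a Process subclass that records the child's exception and
            # result; the stock multiprocessing.Process exposes no exception()/result().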
            p = Process(target=run_job,
                        kwargs=dict(config=sweep_config,
                                    task_function=self.task_function,
                                    job_dir_key="hydra.sweep.dir",
                                    job_subdir_key="hydra.sweep.subdir"))
            p.start()
            workers.append(p)

            # wait for current/last batch of workers
            if (i + 1) % self._n_jobs == 0 or (i + 1) == len(job_overrides):
                for w in workers:
                    w.join()

                    # forward exceptions from the workers
                    if w.exception():
                        raise w.exception()

                # book keeping
                results.extend([p.result() for p in workers])
                workers = []

        assert len(results) == len(job_overrides)
        return results
Example #26
def launch(
    launcher: RQLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    """
    :param job_overrides: a List of List<String>, where each inner list is the arguments for one job run.
    :param initial_job_idx: Initial job idx in batch.
    :return: an array of return values from run_job with indexes corresponding to the input list indexes.
    """
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    configure_log(launcher.config.hydra.hydra_logging,
                  launcher.config.hydra.verbose)
    sweep_dir = Path(str(launcher.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # RQ configuration
    rq_cfg = launcher.rq

    # Redis configuration
    is_async = not rq_cfg.redis.mock
    if is_async:
        connection = Redis(
            host=rq_cfg.redis.host,
            port=rq_cfg.redis.port,
            db=rq_cfg.redis.db,
            password=rq_cfg.redis.password,
        )
    else:
        log.info("Running in synchronous mode")
        connection = FakeStrictRedis()
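    # cloudpickle is used as the serializer so the enqueued task function and
    # configs survive pickling even where plain pickle would reject them.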
    queue = Queue(
        name=rq_cfg.queue,
        connection=connection,
        is_async=is_async,
        serializer=cloudpickle,
    )

    # Enqueue jobs
    jobs = []
    singleton_state = Singleton.get_state()
    log.info(
        f"RQ Launcher is enqueuing {len(job_overrides)} job(s) in queue : {rq_cfg.queue}"
    )
    log.info("Sweep output dir : {}".format(sweep_dir))
    if not sweep_dir.is_absolute():
        log.warning(
            "Using relative sweep dir: Please be aware that dir will be relative to where workers are started from."
        )

    for idx, overrides in enumerate(job_overrides):
        description = " ".join(filter_overrides(overrides))

        enqueue_keywords = OmegaConf.to_container(rq_cfg.enqueue, resolve=True)
        if enqueue_keywords["job_timeout"] is None:
            enqueue_keywords["job_timeout"] = -1
        if enqueue_keywords["result_ttl"] is None:
            enqueue_keywords["result_ttl"] = -1
        if enqueue_keywords["failure_ttl"] is None:
            enqueue_keywords["failure_ttl"] = -1
        if enqueue_keywords["job_id"] is None:
            enqueue_keywords["job_id"] = str(uuid.uuid4())
        if enqueue_keywords["description"] is None:
            enqueue_keywords["description"] = description

        sweep_config = launcher.config_loader.load_sweep_config(
            launcher.config, list(overrides))
        with open_dict(sweep_config):
            sweep_config.hydra.job.id = enqueue_keywords["job_id"]
            sweep_config.hydra.job.num = initial_job_idx + idx

        job = queue.enqueue(
            execute_job,
            sweep_config=sweep_config,
            task_function=launcher.task_function,
            singleton_state=singleton_state,
            **enqueue_keywords,
        )
        jobs.append(job)

        log.info(f"Enqueued {job.get_id()}")
        log.info(f"\t#{idx+1} : {description}")

    log.info("Finished enqueuing")
    if rq_cfg.stop_after_enqueue:
        raise StopAfterEnqueue

    log.info(f"Polling job statuses every {rq_cfg.wait_polling} sec")
    while True:
        job_ids_done = [
            job.get_id() for job in jobs
            if job.get_status() in ["finished", "failed"]
        ]
        if len(job_ids_done) == len(jobs):
            break
        else:
            time.sleep(rq_cfg.wait_polling)

    runs = []
    for job in jobs:
        result = job.result
        runs.append(result)

    assert isinstance(runs, List)
    for run in runs:
        assert isinstance(run, JobReturn)

    return runs