Example #1
def setup_multirun():
    import torch
    from hydra.core.hydra_config import HydraConfig
    if 'num' in HydraConfig.get().job:
        job_num = HydraConfig.get().job.num
        gpu = job_num % torch.cuda.device_count()
        torch.cuda.set_device(gpu)
        print(f'Job number {job_num:2d}')
        print(f'Setting active GPU to {gpu}')
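A minimal sketch of how a helper like this might be wired into a Hydra entry point for a multirun sweep; the config path, config name, and task body are hypothetical:

import hydra
from omegaconf import DictConfig

@hydra.main(config_path="conf", config_name="config")
def my_task(cfg: DictConfig) -> None:
    # In a multirun (python my_task.py --multirun ...) each job receives a
    # distinct hydra.job.num, so setup_multirun() pins each job to its own GPU.
    setup_multirun()

if __name__ == "__main__":
    my_task()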
Example #2
def test_configuration_set_via_cmd_and_default_config(
    sweep_runner: TSweepRunner,
) -> None:
    sweep = sweep_runner(
        calling_file="tests/test_ax_sweeper_plugin.py",
        calling_module=None,
        task_function=quadratic,
        config_path="config",
        config_name="config.yaml",
        overrides=[
            "hydra/launcher=basic",
            "hydra.sweeper.params.ax_config.max_trials=2",
            "hydra.sweeper.params.ax_config.early_stop.max_epochs_without_improvement=2",
            "quadratic=basic",
            "quadratic.x=-5:-2",
            "quadratic.y=-1:1",
        ],
    )
    with sweep:
        ax_config = HydraConfig.get().sweeper.params.ax_config
        assert ax_config.max_trials == 2
        assert ax_config.early_stop.max_epochs_without_improvement == 2
        assert ax_config.experiment.minimize is True
        assert sweep.returns is None
        returns = OmegaConf.load(f"{sweep.temp_dir}/optimization_results.yaml")
        assert isinstance(returns, DictConfig)
        best_parameters = returns["ax"]
        assert "quadratic_x" in best_parameters
        assert "quadratic_y" in best_parameters
Example #3
def setup_globals() -> None:
    # please add documentation when you add a new resolver
    OmegaConf.register_new_resolver(
        "now",
        lambda pattern: datetime.now().strftime(pattern),
        use_cache=True,
        replace=True,
    )
    OmegaConf.register_new_resolver(
        "hydra",
        lambda path: OmegaConf.select(cast(DictConfig, HydraConfig.get()), path),
        replace=True,
    )

    vi = sys.version_info
    version_dict = {
        "major": f"{vi[0]}",
        "minor": f"{vi[0]}.{vi[1]}",
        "micro": f"{vi[0]}.{vi[1]}.{vi[2]}",
    }
    OmegaConf.register_new_resolver(
        "python_version",
        lambda level="minor": version_dict.get(level),
        replace=True)
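A minimal usage sketch, assuming setup_globals() above has already run; the keys are illustrative (the hydra resolver additionally requires an initialized HydraConfig):

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "timestamp": "${now:%Y-%m-%d_%H-%M-%S}",
    "py": "${python_version:minor}",
})
print(cfg.timestamp)  # e.g. 2024-01-01_12-00-00
print(cfg.py)         # e.g. 3.10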
Example #4
def _get_job_id() -> int:
    try:
        return HydraConfig.get().job.id
    except MissingMandatoryValue:
        # The numeric job ID is missing if not in a multirun context. In that
        # case, there can only be a single run.
        return 0
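A hedged usage sketch built on the helper above; the file-name scheme is hypothetical:

def artifact_name(prefix: str) -> str:
    # Keep per-job artifacts distinct across a multirun by appending the
    # numeric job id (0 for single runs).
    return f"{prefix}_{_get_job_id()}.pt"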
Example #5
def launch_jobs(temp_dir: str) -> None:
    runs = []
    with open(os.path.join(temp_dir, JOB_SPEC_PICKLE), "rb") as f:
        job_spec = pickle.load(f)  # nosec
        singleton_state = job_spec["singleton_state"]
        sweep_configs = job_spec["sweep_configs"]
        task_function = job_spec["task_function"]

        instance_id = _get_instance_id()

        sweep_dir = None

        for sweep_config in sweep_configs:
            with open_dict(sweep_config):
                sweep_config.hydra.job.id = (
                    f"{instance_id}_{sweep_config.hydra.job.num}"
                )
            setup_globals()
            Singleton.set_state(singleton_state)
            HydraConfig.instance().set_config(sweep_config)
            ray_init_cfg = sweep_config.hydra.launcher.ray_init_cfg
            ray_remote_cfg = sweep_config.hydra.launcher.ray_remote_cfg

            if not sweep_dir:
                sweep_dir = Path(str(HydraConfig.get().sweep.dir))
                sweep_dir.mkdir(parents=True, exist_ok=True)

            start_ray(ray_init_cfg)
            ray_obj = launch_job_on_ray(
                ray_remote_cfg, sweep_config, task_function, singleton_state
            )
            runs.append(ray_obj)

    result = [ray.get(run) for run in runs]
    _dump_job_return(result, temp_dir)
Example #6
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    setup_commands = launcher.env_setup.commands
    with read_write(setup_commands):
        setup_commands.extend([
            f"pip install {package}=={version}"
            for package, version in launcher.env_setup.pip_packages.items()
        ])
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)

    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands

    configure_log(launcher.config.hydra.hydra_logging,
                  launcher.config.hydra.verbose)

    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs, ")

    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            ostr = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {ostr}")
            sweep_config = launcher.config_loader.load_sweep_config(
                launcher.config, list(overrides))
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = idx

            sweep_configs.append(sweep_config)

        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )

        with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
            with open(f.name, "w") as file:
                OmegaConf.save(config=launcher.ray_cfg.cluster,
                               f=file.name,
                               resolve=True)
            launcher.ray_yaml_path = f.name
            log.info(
                f"Saving RayClusterConf in a temp yaml file: {launcher.ray_yaml_path}."
            )

            return launch_jobs(launcher, local_tmp_dir,
                               Path(HydraConfig.get().sweep.dir))
Example #7
def get_original_cwd() -> str:
    """
    :return: the original working directory the Hydra application was launched from
    """
    if not HydraConfig.initialized():
        raise ValueError(
            "get_original_cwd() must only be used after HydraConfig is initialized"
        )
    ret = HydraConfig.get().runtime.cwd
    assert ret is not None and isinstance(ret, str)
    return ret
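Because Hydra can change the working directory per run, relative user paths are usually resolved against the original cwd; a minimal sketch of that pattern (hydra.utils.to_absolute_path offers the same behavior):

import os

def resolve_input(path: str) -> str:
    # Resolve a user-supplied relative path against the launch directory,
    # not the per-run output directory Hydra may have chdir'ed into.
    if os.path.isabs(path):
        return path
    return os.path.join(get_original_cwd(), path)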
Example #8
def run_job(
    config: DictConfig,
    task_function: TaskFunction,
    job_dir_key: str,
    job_subdir_key: Optional[str],
) -> "JobReturn":
    old_cwd = os.getcwd()
    working_dir = str(OmegaConf.select(config, job_dir_key))
    if job_subdir_key is not None:
        # evaluate job_subdir_key lazily.
        # this is running on the client side in sweep and contains things such as job:id which
        # are only available there.
        subdir = str(OmegaConf.select(config, job_subdir_key))
        working_dir = os.path.join(working_dir, subdir)
    try:
        ret = JobReturn()
        ret.working_dir = working_dir
        task_cfg = copy.deepcopy(config)
        with read_write(task_cfg):
            with open_dict(task_cfg):
                del task_cfg["hydra"]
        ret.cfg = task_cfg
        ret.hydra_cfg = OmegaConf.create({"hydra": HydraConfig.get()})
        overrides = OmegaConf.to_container(config.hydra.overrides.task)
        assert isinstance(overrides, list)
        ret.overrides = overrides
        # handle output directories here
        Path(str(working_dir)).mkdir(parents=True, exist_ok=True)
        os.chdir(working_dir)

        configure_log(config.hydra.job_logging, config.hydra.verbose)

        hydra_cfg = OmegaConf.masked_copy(config, "hydra")
        assert isinstance(hydra_cfg, DictConfig)

        if config.hydra.output_subdir is not None:
            hydra_output = Path(config.hydra.output_subdir)
            _save_config(task_cfg, "config.yaml", hydra_output)
            _save_config(hydra_cfg, "hydra.yaml", hydra_output)
            _save_config(config.hydra.overrides.task, "overrides.yaml",
                         hydra_output)

        with env_override(hydra_cfg.hydra.job.env_set):
            ret.return_value = task_function(task_cfg)
        ret.task_name = JobRuntime.instance().get("name")

        # shut down logging to ensure job log files are closed.
        # If logging is still required after run_job caller is responsible to re-initialize it.
        logging.shutdown()

        return ret
    finally:
        os.chdir(old_cwd)
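The env_override context manager used above is a Hydra internal; a minimal sketch of such a helper, assuming a dict-like mapping of variable names to values:

import os
from contextlib import contextmanager

@contextmanager
def env_override(env):
    # Set the given environment variables for the duration of the block,
    # then restore (or remove) the previous values.
    saved = {k: os.environ.get(k) for k in env}
    os.environ.update({k: str(v) for k, v in env.items()})
    try:
        yield
    finally:
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v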
Example #9
def my_app(cfg: DictConfig) -> str:
    def pickle_cfg(path: Path, obj: Any) -> Any:
        with open(str(path), "wb") as file:
            pickle.dump(obj, file)

    hydra_cfg = HydraConfig.get()
    output_dir = Path(hydra_cfg.runtime.output_dir)
    pickle_cfg(Path(output_dir) / "task_cfg.pickle", cfg)
    pickle_cfg(Path(output_dir) / "hydra_cfg.pickle", hydra_cfg)
    log.info("Running my_app")

    return "hello world"
Example #10
def setup_globals() -> None:
    def register(name: str, f: Any) -> None:
        try:
            OmegaConf.register_resolver(name, f)
        except AssertionError:
            # calling it again in no_workers mode will throw. safe to ignore.
            pass

    register("now", lambda pattern: strftime(pattern, localtime()))
    register(
        "hydra",
        lambda path: OmegaConf.select(cast(DictConfig, HydraConfig.get()), path),
    )
Example #11
def hydra_main(cfg):
    with open_dict(cfg):
        # make hydra logging work with ddp (see https://github.com/facebookresearch/hydra/issues/1126)
        cfg.job_logging_cfg = OmegaConf.to_container(
            HydraConfig.get().job_logging, resolve=True)

    cfg = OmegaConf.create(
        OmegaConf.to_container(cfg, resolve=False, enum_to_str=False))
    OmegaConf.set_struct(cfg, True)
    logger.info(cfg)
    _, score = main(cfg)

    if cfg.is_ax:
        return score, None
    return score
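The stored job_logging_cfg is typically re-applied inside each spawned DDP worker, since child processes do not inherit Hydra's logging setup; a minimal sketch, with the worker entry point hypothetical:

import logging.config
from omegaconf import OmegaConf

def worker_entry(rank: int, cfg) -> None:
    # Restore Hydra's job logging configuration in the spawned process
    # before doing any work.
    logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_cfg))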
Example #12
def main(config):
    # to load a checkpoint and perform inference, add +evaluate=<dir_of_experiment>
    if "evaluate" not in config:
        learner = get_learner(config)
        datasets = get_datasets(learner.data_dir, config.data.order, debug=config.debug_data)
        learner.training(datasets)
        learner.write_metrics()
        if config.save_checkpoint:
            learner.save_checkpoint()
    else:
        # no training, just evaluate
        logger.info(f"Evaluating {config.evaluate}..")
        experiment_path = Path(hydra.utils.to_absolute_path(EXPERIMENT_DIR / config.evaluate))
        config_file = experiment_path / '.hydra' / 'config.yaml'
        overrides = HydraConfig.get().overrides.task
        config = OmegaConf.load(config_file)
        config.wandb = False
        config.evaluate = True
        config.testing.n_samples_before_average_evaluate = 80
        config.testing.few_shot = False

        learner = get_learner(config, experiment_path=experiment_path)
        learner.load_checkpoint()
        datasets = get_datasets(learner.data_dir, config.data.order, debug=config.debug_data)

    # validation set
    logger.info("----------Validation starts here----------\n")
    learner.testing(datasets, order=datasets["order"])
    learner.write_metrics()
    validation_results = analyze_results(metrics_path=learner.results_dir / METRICS_FILE,
                                         use_wandb=config.wandb)

    # validation_results = {"validation_" + k : v for k, v in mean_results.items()}
    # pd.DataFrame.from_dict(results, orient="index").to_csv(learner.results_dir / "validation_results.csv")

    # test set
    # logger.info("----------Testing starts here----------")
    # results, mean_results = learner.testing(datasets["test"], order=datasets["order"])
    # mean_test_results = {"test_" + k : v for k, v in mean_results.items()}
    # pd.DataFrame.from_dict(results, orient="index").to_csv(learner.results_dir / "test_results.csv")

    if config.wandb:
        wandb.run.summary.update(validation_results)
        # wandb.log(validation_results)
        # wandb.log(mean_test_results)
        learner.wandb_run.finish()

    logger.info("------------------------- Run Finished -------------------------")
Example #13
def main(hydra_cfg):
    hydra_cfg.device = hydra_cfg.device.lower()
    with open_dict(hydra_cfg):
        hydra_cfg.job_logging_cfg = HydraConfig.get().job_logging
    # random seed
    if hydra_cfg.random_seed is None:
        hydra_cfg.random_seed = random.randint(1, 10000)
    set_random_seed(hydra_cfg.random_seed)

    if hydra_cfg.dist.gpus < 0:
        hydra_cfg.dist.gpus = torch.cuda.device_count()
    if hydra_cfg.device == "cpu" or hydra_cfg.dist.gpus == 0:
        hydra_cfg.dist.gpus = 0
        train_loop(0, hydra_cfg)
    else:
        distributed_run(train_loop, hydra_cfg)
Example #14
def _hydra_main(cfg: FairseqConfig, **kwargs) -> float:
    add_defaults(cfg)

    if cfg.common.reset_logging:
        reset_logging()  # Hydra hijacks logging, fix that
    else:
        # check if directly called or called through hydra_main
        if HydraConfig.initialized():
            with open_dict(cfg):
                # make hydra logging work with ddp (see https://github.com/facebookresearch/hydra/issues/1126)
                cfg.job_logging_cfg = OmegaConf.to_container(
                    HydraConfig.get().job_logging, resolve=True)

    with omegaconf_no_object_check():
        cfg = OmegaConf.create(
            OmegaConf.to_container(cfg, resolve=True, enum_to_str=True))
    OmegaConf.set_struct(cfg, True)

    try:
        if cfg.common.profile:
            with torch.cuda.profiler.profile():
                with torch.autograd.profiler.emit_nvtx():
                    distributed_utils.call_main(cfg, pre_main, **kwargs)
        else:
            distributed_utils.call_main(cfg, pre_main, **kwargs)
    except BaseException as e:
        if not cfg.common.suppress_crashes:
            raise
        else:
            logger.error("Crashed! " + str(e))

    # get best val and return - useful for sweepers
    try:
        best_val = metrics.get_smoothed_value(
            "valid", cfg.checkpoint.best_checkpoint_metric)
    except Exception:
        best_val = None

    if best_val is None:
        best_val = float("inf")

    return best_val
Example #15
def main(hydra_cfg):
    hydra_cfg.device = hydra_cfg.device.lower()
    with open_dict(hydra_cfg):
        hydra_cfg.job_logging_cfg = HydraConfig.get().job_logging
    # random seed
    if hydra_cfg.random_seed is None:
        hydra_cfg.random_seed = random.randint(1, 10000)
    set_random_seed(hydra_cfg.random_seed)

    if hydra_cfg.dist.gpus < 0:
        hydra_cfg.dist.gpus = torch.cuda.device_count()
        hydra_cfg.dist.master_port = os.environ["MASTER_PORT"]
        hydra_cfg.dist.master_addr = os.environ["MASTER_ADDR"]
        print(hydra_cfg.dist)

    if hydra_cfg.device == "cpu" or hydra_cfg.dist.gpus == 0:
        hydra_cfg.dist.gpus = 0
        train_loop(0, hydra_cfg)
    else:
        distributed_run(train_loop, hydra_cfg)
Example #16
def setup_globals() -> None:
    def register(name: str, f: Any) -> None:
        try:
            OmegaConf.register_resolver(name, f)
        except AssertionError:
            # calling it again in no_workers mode will throw. safe to ignore.
            pass

    # please add documentation when you add a new resolver
    register("now", lambda pattern: strftime(pattern, localtime()))
    register(
        "hydra",
        lambda path: OmegaConf.select(cast(DictConfig, HydraConfig.get()), path),
    )

    vi = sys.version_info
    version_dict = {
        "major": f"{vi[0]}",
        "minor": f"{vi[0]}.{vi[1]}",
        "micro": f"{vi[0]}.{vi[1]}.{vi[2]}",
    }
    register("python_version", lambda level="minor": version_dict.get(level))
Example #17
def train_with_hydra(cfg: DictConfig):

    # ----------
    # TRAINING REPRODUCIBILITY
    # -----------

    # disable this if you want a different random initialization every time
    seed_everything(cfg.trainer.seed_number, workers=cfg.trainer.workers)

    # ----------
    # INSTANTIATE DATASET FROM CONF
    # -----------

    # current experimentPath
    experimentPath = (HydraConfig.get()["runtime"]["cwd"] + "/outputs/" +
                      cfg.experiment_name + "/exp_" +
                      str(cfg.experiment_number))

    createDir(experimentPath, cfg)

    dm = instantiate(cfg.dataset)
    dm.setup()

    # ----------
    # INSTANTIATE MODEL FROM HYDRA CONF
    # -----------
    model = instantiate(cfg.model.anomaly_classifier)

    # load pytorch model pretrained model
    # model.load_pretrained_pytorch_model()

    # show model on output console

    # ----------
    # INSTANTIATE TRAINER FROM HYDRA CONF
    # -----------

    # instantiate trainer
    trainer = instantiate(cfg.trainer.default)

    # ----------
    # LOG HYDRA CONF AS ML FLOW ARTIFACT
    # -----------

    # To use MLflow with PyTorch Lightning you need the run id, because PL uses MlflowClient
    mlFlowRunId = trainer.logger[0].run_id
    mlFlowClient = trainer.logger.experiment[0]

    mlFlowArtifactPath = HydraConfig.get()["runtime"]["cwd"] + "/conf/"

    print("HYDRA RUNTIME CWD {}".format(HydraConfig.get()["runtime"]["cwd"]))

    modelFile = mlFlowArtifactPath + "model/" + \
        HydraConfig.get()["runtime"]["choices"]["model"] + ".yaml"
    trainerFile = mlFlowArtifactPath + "trainer/" + \
        HydraConfig.get()["runtime"]["choices"]["trainer"] + ".yaml"
    datasetFile = mlFlowArtifactPath + "dataset/" + \
        HydraConfig.get()["runtime"]["choices"]["dataset"] + ".yaml"

    # Save yaml file as mlflow artifacts
    mlFlowClient.log_artifact(mlFlowRunId, modelFile, "model")
    mlFlowClient.log_artifact(mlFlowRunId, trainerFile, "trainer")
    mlFlowClient.log_artifact(mlFlowRunId, datasetFile, "dataset")

    # Keep track of model path in Mlflow
    mlFlowClient.log_param(mlFlowRunId, "model_path", experimentPath)

    # Create folder for mlLogs

    # Path(cfg.trainer.default.logger.tracking_uri).mkdir(
    #      parents=True, exist_ok=True)

    if cfg.general_cfg.phase == "train":
        trainer.fit(model=model, datamodule=dm)

    elif cfg.general_cfg.phase == "test":
        trainer.test(model=model, datamodule=dm)
Example #18
def run(cfg: DictConfig) -> None:
    """
    Generic train loop

    :param cfg: run configuration, defined by Hydra in /conf
    """
    if cfg.train.deterministic:
        seed_everything(cfg.train.random_seed)

    if cfg.train.pl_trainer.fast_dev_run:
        hydra.utils.log.info(
            f"Debug mode <{cfg.train.pl_trainer.fast_dev_run=}>. "
            f"Forcing debugger friendly configuration!")
        # Debuggers don't like GPUs nor multiprocessing
        cfg.train.pl_trainer.gpus = 0
        cfg.data.datamodule.num_workers.train = 0
        cfg.data.datamodule.num_workers.val = 0
        cfg.data.datamodule.num_workers.test = 0

    # Hydra run directory
    hydra_dir = Path(HydraConfig.get().run.dir)

    # Instantiate datamodule
    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule: pl.LightningDataModule = hydra.utils.instantiate(
        cfg.data.datamodule, cfg=cfg)

    # Instantiate model
    hydra.utils.log.info(f"Instantiating <{cfg.model._target_}>")
    model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg)

    # Instantiate the callbacks
    callbacks: List[Callback] = build_callbacks(cfg=cfg)

    # Logger instantiation/configuration
    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info(f"Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        wandb_logger = WandbLogger(
            project=wandb_config.project,
            entity=wandb_config.entity,
            tags=cfg.core.tags,
            log_model=True,
        )
        hydra.utils.log.info(
            f"W&B is now watching <{wandb_config.watch.log}>!")
        wandb_logger.watch(model,
                           log=wandb_config.watch.log,
                           log_freq=wandb_config.watch.log_freq)

    hydra.utils.log.info(f"Instantiating the Trainer")

    # The Lightning core, the Trainer
    trainer = pl.Trainer(
        default_root_dir=hydra_dir,
        logger=wandb_logger,
        callbacks=callbacks,
        deterministic=cfg.train.deterministic,
        val_check_interval=cfg.logging.val_check_interval,
        progress_bar_refresh_rate=cfg.logging.progress_bar_refresh_rate,
        **cfg.train.pl_trainer,
    )

    hydra.utils.log.info(f"Starting training!")
    trainer.fit(model=model, datamodule=datamodule)

    hydra.utils.log.info(f"Starting testing!")
    trainer.test(model=model, datamodule=datamodule)

    shutil.copytree(".hydra", Path(wandb_logger.experiment.dir) / "hydra")

    # Logger closing to release resources/avoid multi-run conflicts
    if wandb_logger is not None:
        wandb_logger.experiment.finish()
Example #19
    def simulate(self, input_mappings):
        """Simulate multiple mappings.

        Args:
            input_mappings: input mappings

        Returns:
            list of the objects of the class `SimulationResult`. The length of
            the list is equal to the length of `input_mappings`.
        """
        # check inputs
        if len(input_mappings) == 0:
            log.warning("Trying to simulate an empty mapping list")
            return []

        time = process_time()
        if isinstance(input_mappings[0], Mapping):
            tup = [
                tuple(self.representation.toRepresentation(m))
                for m in input_mappings
            ]
            mappings = input_mappings
        else:  # otherwise assume the mappings are list-like
            # transform into tuples
            tup = [
                tuple(self.representation.approximate(np.array(m)))
                for m in input_mappings
            ]
            mappings = [self.representation.fromRepresentation(m) for m in tup]
        self.statistics.add_rep_time(process_time() - time)

        # first look up as many as possible:
        lookups = [self.lookup(t) for t in tup]
        num = len([m for m in lookups if m])
        log.info(f"{num} from cache.")
        self.statistics.mappings_cached(num)

        # if all were already cached, return them
        if num == len(tup):
            return lookups

        # create a list of simulations to be run.
        # each element is a tuple (simulation, hydra_configuration)
        simulations = []
        # Logging is not configured in the spawned processes on macOS.
        # As a workaround, suggested in
        # https://github.com/facebookresearch/hydra/issues/1005
        # we pass the hydra configuration to the child processes
        cfg_pickled = None
        if HydraConfig.initialized():
            config = HydraConfig.get()
            cfg_pickled = cloudpickle.dumps(config)
        for i, mapping in enumerate(mappings):
            # skip if this particular mapping is in the cache
            if lookups[i]:
                continue

            simulation = DataflowSimulation(self.platform, self.graph, mapping,
                                            self.trace)

            simulations.append((simulation, cfg_pickled))
        if self.parallel and len(simulations) > self.chunk_size:
            # since mappings are simulated in parallel, whole simulation time
            # is added later as offset
            for _ in simulations:
                self.statistics.mapping_evaluated(0)

            # run the simulations in parallel
            with mp.Pool(processes=self.jobs) as pool:
                to_simulate = pool.imap(
                    run_simulation_logger_wrapper,
                    simulations,
                    chunksize=self.chunk_size,
                )
                if self.progress:
                    import tqdm

                    to_simulate = tqdm.tqdm(
                        to_simulate,
                        total=len(mappings),
                    )
                simulated = list(to_simulate)
                time = sum([s[1] for s in simulated])
                simulated = [s[0] for s in simulated]
                self.statistics.add_offset(time)
        else:
            simulated = []
            # run the simulations sequentially
            for s in simulations:
                s, time = run_simulation(s[0])
                simulated.append(s)
                self.statistics.mapping_evaluated(time)

        # Collect the simulation results and store them
        sim_results = []
        sim_iter = iter(simulated)
        for i, mapping in enumerate(mappings):
            sim_res = lookups[i]
            if sim_res:
                sim_results.append(sim_res)
            else:
                s = next(sim_iter)
                sim_results.append(s.result)
                self.add_mapping_result(tup[i], s.result)
        return sim_results
Example #20
def my_app(_: DictConfig) -> None:
    run_dir = str(Path.cwd().relative_to(get_original_cwd()))
    time.sleep(2)
    run_dir_after_sleep = str(Path(HydraConfig.get().run.dir))
    assert run_dir == run_dir_after_sleep
Example #21
    def __call__(self, cfg: DictConfig) -> None:
        print(self._state)
        print(HydraConfig.get().job.name)
Example #22
def run(cfg: DictConfig) -> None:
    """
    Generic train loop

    :param cfg: run configuration, defined by Hydra in /conf
    """
    if cfg.train.deterministic:
        seed_everything(cfg.train.random_seed)

    if cfg.train.pl_trainer.fast_dev_run:
        hydra.utils.log.info(
            f"Debug mode <fast_dev_run{cfg.train.pl_trainer.fast_dev_run}>. "
            f"Forcing debugger friendly configuration!")
        # Debuggers don't like GPUs nor multiprocessing
        cfg.train.pl_trainer.gpus = 0
        cfg.data.datamodule.num_workers.train = 0
        cfg.data.datamodule.num_workers.val = 0
        cfg.data.datamodule.num_workers.test = 0

        # Switch wandb mode to offline to prevent online logging
        cfg.logging.wandb.mode = "offline"

    # Hydra run directory
    hydra_dir = Path(HydraConfig.get().run.dir)

    # Instantiate datamodule
    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule: pl.LightningDataModule = hydra.utils.instantiate(
        cfg.data.datamodule, _recursive_=False)

    # Instantiate model
    hydra.utils.log.info(f"Instantiating <{cfg.model._target_}>")
    model: pl.LightningModule = hydra.utils.instantiate(
        cfg.model,
        physics=cfg.physics,
        optim=cfg.optim,
        data=cfg.data,
        logging=cfg.logging,
        _recursive_=False,
    )

    # Instantiate the callbacks
    callbacks: List[Callback] = build_callbacks(cfg=cfg)

    # Logger instantiation/configuration
    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info(f"Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        wandb_logger = WandbLogger(
            **wandb_config,
            tags=cfg.core.tags,
        )
        hydra.utils.log.info(
            f"W&B is now watching <{cfg.logging.wandb_watch.log}>!")
        wandb_logger.watch(
            model,
            log=cfg.logging.wandb_watch.log,
            log_freq=cfg.logging.wandb_watch.log_freq,
        )

    # Store the YaML config separately into the wandb dir
    yaml_conf: str = OmegaConf.to_yaml(cfg=cfg)
    (Path(wandb_logger.experiment.dir) / "hparams.yaml").write_text(yaml_conf)

    hydra.utils.log.info(f"Instantiating the Trainer")

    # The Lightning core, the Trainer
    trainer = pl.Trainer(
        default_root_dir=hydra_dir,
        logger=wandb_logger,
        callbacks=callbacks,
        deterministic=cfg.train.deterministic,
        val_check_interval=cfg.logging.val_check_interval,
        progress_bar_refresh_rate=cfg.logging.progress_bar_refresh_rate,
        **cfg.train.pl_trainer,
    )
    log_hyperparameters(trainer=trainer, model=model, cfg=cfg)

    hydra.utils.log.info(f"Starting training!")
    trainer.fit(model=model, datamodule=datamodule)

    hydra.utils.log.info(f"Starting testing!")
    trainer.test(model=model, datamodule=datamodule)

    # Logger closing to release resources/avoid multi-run conflicts
    if wandb_logger is not None:
        wandb_logger.experiment.finish()
Example #23
def run(cfg: DictConfig) -> None:
    """
    Generic train loop

    :param cfg: run configuration, defined by hydra in /conf
    """
    if cfg.train.deterministic:
        seed_everything(cfg.train.random_seed)

    # Hydra run directory
    hydra_dir = Path(HydraConfig.get().run.dir)

    # Instantiate datamodule
    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule: pl.LightningDataModule = hydra.utils.instantiate(
        cfg.data.datamodule, cfg=cfg)

    # Instantiate model
    hydra.utils.log.info(f"Instantiating <{cfg.model._target_}>")
    model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg)

    callbacks = []
    if "lr_monitor" in cfg.logging:
        hydra.utils.log.info(f"Adding callback <LearningRateMonitor>")
        callbacks.append(
            LearningRateMonitor(
                logging_interval=cfg.logging.lr_monitor.logging_interval,
                log_momentum=cfg.logging.lr_monitor.log_momentum,
            ))

    if "early_stopping" in cfg.train:
        hydra.utils.log.info(f"Adding callback <EarlyStopping>")
        callbacks.append(
            EarlyStopping(
                monitor=cfg.train.monitor_metric,
                mode=cfg.train.monitor_metric_mode,
                patience=cfg.train.early_stopping.patience,
                verbose=cfg.train.early_stopping.verbose,
            ))

    if "model_checkpoints" in cfg.train.model_checkpoints:
        hydra.utils.log.info(f"Adding callback <ModelCheckpoint>")
        callbacks.append(
            ModelCheckpoint(
                monitor=cfg.train.monitor_metric,
                mode=cfg.train.monitor_metric_mode,
                save_top_k=cfg.train.model_checkpoints.save_top_k,
                verbose=cfg.train.model_checkpoints.verbose,
            ))

    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info(f"Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        wandb_logger = WandbLogger(
            project=wandb_config.project,
            entity=wandb_config.entity,
            tags=cfg.core.tags,
            log_model=True,
        )
        hydra.utils.log.info(
            f"W&B is now watching <{wandb_config.watch.log}>!")
        wandb_logger.watch(model,
                           log=wandb_config.watch.log,
                           log_freq=wandb_config.watch.log_freq)

    hydra.utils.log.info(f"Instantiating the Trainer")
    trainer = pl.Trainer(
        fast_dev_run=True,
        default_root_dir=hydra_dir,
        logger=wandb_logger,
        callbacks=callbacks,
        deterministic=cfg.train.deterministic,
        val_check_interval=cfg.logging.val_check_interval,
        progress_bar_refresh_rate=cfg.logging.progress_bar_refresh_rate,
        **cfg.train.pl_trainer,
    )

    hydra.utils.log.info(f"Starting training!")
    trainer.fit(model=model, datamodule=datamodule)

    hydra.utils.log.info(f"Starting testing!")
    trainer.test(model=model, datamodule=datamodule)

    shutil.copytree(".hydra", Path(wandb_logger.experiment.dir) / "hydra")
Example #24
def set_job_name(cfg, choices_list):
    hydra_cfg = HydraConfig.get()
    choices = hydra_cfg.runtime.choices
    cfg.job.name = '_'.join(choices[group] for group in choices_list if group in choices)
Example #25
def main(_: DictConfig) -> None:
    subdir = Path(HydraConfig.get().run.dir) / Path("subdir")
    subdir.mkdir(exist_ok=True, parents=True)
    os.chdir(subdir)
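A hedged variant of the same pattern that restores the working directory afterwards, useful when later code assumes the run dir; the helper name is hypothetical:

import os
from contextlib import contextmanager

@contextmanager
def pushd(path):
    # Enter the directory for the duration of the block, then restore the
    # previous working directory.
    old = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(old)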
Example #26
def my_app(_: DictConfig) -> None:
    print(HydraConfig.get().mode)
Example #27
def experiment(_cfg: DictConfig) -> None:
    print(HydraConfig.get().job.name)
Example #28
def train_with_hydra(cfg: DictConfig):

    # ----------
    # TRAINING REPRODUCIBILITY
    # -----------

    # print augmentation applied
    #print(instantiate(cfg.dataset.train_transform, _convert_="all").transform.tfms_list)

    # disable this if you want a different random initialization every time
    seed_everything(42, workers=cfg.trainer.workers)

    # Dataclass for custom Image transform See dataset configuration in .yaml
    @dataclass
    class ObjectDetectionInputTransform(InputTransform):

        # transforms added to training data
        def train_per_sample_transform(self):
            return instantiate(cfg.dataset.train_transform, _convert_="all")

        # transforms added to validation data
        def val_per_sample_transform(self):
            return instantiate(cfg.dataset.val_transform, _convert_="all")

    # ----------
    # INSTANTIATE DATASET FROM HYDRA CONF
    # -----------

    # train_trans = instantiate(cfg.dataset.train_transform, _convert_="all")

    # print(train_trans.transform.tfms_list)

    # input("TEST")

    dm = ObjectDetectionData.from_coco(
        train_folder=cfg.dataset.path_folder_images,
        train_ann_file=cfg.dataset.path_folder_annotations,
        val_split=cfg.dataset.val_split,
        train_transform=ObjectDetectionInputTransform,
        val_transform=ObjectDetectionInputTransform,
        batch_size=cfg.dataset.batch_size)

    # ----------
    # INSTANTIATE MODEL FROM HYDRA CONF
    # -----------

    # set the number of output classes at runtime
    cfg.model.model.num_classes = dm.num_classes

    model = instantiate(cfg.model.model)

    # ----------
    # INSTANTIATE TRAINER FROM HYDRA CONF
    # -----------

    # instantiate trainer

    trainer = instantiate(cfg.trainer.default)

    # ----------
    # LOG HYDRA CONF AS ML FLOW ARTIFACT
    # -----------

    # To use MLflow with PyTorch Lightning you need the run id, because PL uses MlflowClient
    mlFlowRunId = trainer.logger[0].run_id
    mlFlowClient = trainer.logger.experiment[0]

    mlFlowArtifactPath = HydraConfig.get()["runtime"]["cwd"] + "/conf/"

    print("HYDRA RUNTIME CWD {}".format(HydraConfig.get()["runtime"]["cwd"]))

    modelFile = mlFlowArtifactPath + "model/" + \
        HydraConfig.get()["runtime"]["choices"]["model"] + ".yaml"
    trainerFile = mlFlowArtifactPath + "trainer/" + \
        HydraConfig.get()["runtime"]["choices"]["trainer"] + ".yaml"
    datasetFile = mlFlowArtifactPath + "dataset/" + \
        HydraConfig.get()["runtime"]["choices"]["dataset"] + ".yaml"

    # Save yaml file as mlflow artifacts
    mlFlowClient.log_artifact(mlFlowRunId, modelFile, "model")
    mlFlowClient.log_artifact(mlFlowRunId, trainerFile, "trainer")
    mlFlowClient.log_artifact(mlFlowRunId, datasetFile, "dataset")

    # Keep track of the model path
    mlFlowClient.log_param(
        mlFlowRunId, "model_path",
        HydraConfig.get()["runtime"]["cwd"] + "/" + cfg.experiment_name +
        "/exp_" + str(cfg.experiment_number))

    # Path to saved models

    if not os.path.exists(trainer.checkpoint_callback.dirpath):
        os.makedirs(trainer.checkpoint_callback.dirpath)

    # ----------
    # START TRAINING
    # -----------

    if cfg.model.from_scratch:
        print("FROM SCRATCH")
        trainer.fit(model=model, datamodule=dm)

    else:

        if cfg.trainer.ckpt_path is not None:
            print("RESUME TRAINING")
            model = model.load_from_checkpoint(cfg.trainer.ckpt_path)

        trainer.finetune(model=model,
                         datamodule=dm,
                         strategy=cfg.trainer.strategy)
Example #29
def reconstruct_cmd() -> str:
    """Reconstruct the python command that was used to start this program."""
    internal_config = HydraConfig.get()
    program = internal_config.job.name + ".py"
    args = internal_config.overrides.task
    return _join([program] + OmegaConf.to_container(args))  # type: ignore[operator]
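A hedged usage sketch (the file name is illustrative); _join above is presumably a shlex-style argument joiner defined elsewhere in that module:

# Persist the reconstructed command next to the run's outputs so the
# experiment can be re-launched later.
with open("rerun.sh", "w") as f:
    f.write(reconstruct_cmd() + "\n")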