def setup_multirun():
    from hydra.core.hydra_config import HydraConfig

    if 'num' in HydraConfig.get().job:
        job_num = HydraConfig.get().job.num
        # assign GPUs to jobs round-robin
        gpu = job_num % torch.cuda.device_count()
        torch.cuda.set_device(gpu)
        print(f'Job number {job_num:2d}')
        print(f'Setting active GPU to {gpu}')
def test_configuration_set_via_cmd_and_default_config(
    sweep_runner: TSweepRunner,
) -> None:
    sweep = sweep_runner(
        calling_file="tests/test_ax_sweeper_plugin.py",
        calling_module=None,
        task_function=quadratic,
        config_path="config",
        config_name="config.yaml",
        overrides=[
            "hydra/launcher=basic",
            "hydra.sweeper.params.ax_config.max_trials=2",
            "hydra.sweeper.params.ax_config.early_stop.max_epochs_without_improvement=2",
            "quadratic=basic",
            "quadratic.x=-5:-2",
            "quadratic.y=-1:1",
        ],
    )
    with sweep:
        ax_config = HydraConfig.get().sweeper.params.ax_config
        assert ax_config.max_trials == 2
        assert ax_config.early_stop.max_epochs_without_improvement == 2
        assert ax_config.experiment.minimize is True
        assert sweep.returns is None
        returns = OmegaConf.load(f"{sweep.temp_dir}/optimization_results.yaml")
        assert isinstance(returns, DictConfig)
        best_parameters = returns["ax"]
        assert "quadratic_x" in best_parameters
        assert "quadratic_y" in best_parameters
def setup_globals() -> None:
    # please add documentation when you add a new resolver
    OmegaConf.register_new_resolver(
        "now",
        lambda pattern: datetime.now().strftime(pattern),
        use_cache=True,
        replace=True,
    )
    OmegaConf.register_new_resolver(
        "hydra",
        lambda path: OmegaConf.select(cast(DictConfig, HydraConfig.get()), path),
        replace=True,
    )
    vi = sys.version_info
    version_dict = {
        "major": f"{vi[0]}",
        "minor": f"{vi[0]}.{vi[1]}",
        "micro": f"{vi[0]}.{vi[1]}.{vi[2]}",
    }
    OmegaConf.register_new_resolver(
        "python_version", lambda level="minor": version_dict.get(level), replace=True
    )
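# A minimal, self-contained sketch (not from the original source) showing how
# the resolvers registered above are consumed through OmegaConf interpolation.
# The config keys used here are hypothetical; the "hydra" resolver is omitted
# because it requires an initialized HydraConfig.
from omegaconf import OmegaConf

setup_globals()
demo_cfg = OmegaConf.create(
    {
        "run_dir": "outputs/${now:%Y-%m-%d_%H-%M-%S}",
        "py": "${python_version:micro}",
    }
)
print(demo_cfg.run_dir)  # e.g. outputs/2024-01-01_12-00-00
print(demo_cfg.py)       # e.g. 3.10.4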
def _get_job_id() -> int:
    try:
        return HydraConfig.get().job.id
    except MissingMandatoryValue:
        # The numeric job ID is missing if not in a multirun context. In that
        # case, there can only be a single run.
        return 0
def launch_jobs(temp_dir: str) -> None:
    runs = []
    with open(os.path.join(temp_dir, JOB_SPEC_PICKLE), "rb") as f:
        job_spec = pickle.load(f)  # nosec
        singleton_state = job_spec["singleton_state"]
        sweep_configs = job_spec["sweep_configs"]
        task_function = job_spec["task_function"]

        instance_id = _get_instance_id()

        sweep_dir = None

        for sweep_config in sweep_configs:
            with open_dict(sweep_config):
                sweep_config.hydra.job.id = (
                    f"{instance_id}_{sweep_config.hydra.job.num}"
                )
            setup_globals()
            Singleton.set_state(singleton_state)
            HydraConfig.instance().set_config(sweep_config)
            ray_init_cfg = sweep_config.hydra.launcher.ray_init_cfg
            ray_remote_cfg = sweep_config.hydra.launcher.ray_remote_cfg

            if not sweep_dir:
                sweep_dir = Path(str(HydraConfig.get().sweep.dir))
                sweep_dir.mkdir(parents=True, exist_ok=True)

            start_ray(ray_init_cfg)
            ray_obj = launch_job_on_ray(
                ray_remote_cfg, sweep_config, task_function, singleton_state
            )
            runs.append(ray_obj)

    result = [ray.get(run) for run in runs]
    _dump_job_return(result, temp_dir)
def launch(
    launcher: RayAWSLauncher,
    job_overrides: Sequence[Sequence[str]],
    initial_job_idx: int,
) -> Sequence[JobReturn]:
    setup_globals()
    assert launcher.config is not None
    assert launcher.config_loader is not None
    assert launcher.task_function is not None

    setup_commands = launcher.env_setup.commands
    with read_write(setup_commands):
        setup_commands.extend(
            [
                f"pip install {package}=={version}"
                for package, version in launcher.env_setup.pip_packages.items()
            ]
        )
        setup_commands.extend(launcher.ray_cfg.cluster.setup_commands)

    with read_write(launcher.ray_cfg.cluster):
        launcher.ray_cfg.cluster.setup_commands = setup_commands

    configure_log(launcher.config.hydra.hydra_logging, launcher.config.hydra.verbose)
    log.info(f"Ray Launcher is launching {len(job_overrides)} jobs")

    with tempfile.TemporaryDirectory() as local_tmp_dir:
        sweep_configs = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            ostr = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {ostr}")
            sweep_config = launcher.config_loader.load_sweep_config(
                launcher.config, list(overrides)
            )
            with open_dict(sweep_config):
                # job.id will be set on the EC2 instance before running the job.
                sweep_config.hydra.job.num = idx
            sweep_configs.append(sweep_config)

        _pickle_jobs(
            tmp_dir=local_tmp_dir,
            sweep_configs=sweep_configs,  # type: ignore
            task_function=launcher.task_function,
            singleton_state=Singleton.get_state(),
        )

        with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
            with open(f.name, "w") as file:
                OmegaConf.save(
                    config=launcher.ray_cfg.cluster, f=file.name, resolve=True
                )
            launcher.ray_yaml_path = f.name
            log.info(
                f"Saving RayClusterConf in a temp yaml file: {launcher.ray_yaml_path}."
            )

        return launch_jobs(launcher, local_tmp_dir, Path(HydraConfig.get().sweep.dir))
def get_original_cwd() -> str:
    """
    :return: the original working directory the Hydra application was launched from
    """
    if not HydraConfig.initialized():
        raise ValueError(
            "get_original_cwd() must only be used after HydraConfig is initialized"
        )
    ret = HydraConfig.get().runtime.cwd
    assert ret is not None and isinstance(ret, str)
    return ret
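# A minimal usage sketch (hypothetical app, not from the original source).
# Hydra changes the working directory per run, so paths supplied relative to
# the launch directory should be resolved against get_original_cwd() (defined
# above; in applications it is normally imported from hydra.utils).
import os

import hydra
from omegaconf import DictConfig


@hydra.main(config_path="conf", config_name="config")
def demo_app(cfg: DictConfig) -> None:
    data_file = os.path.join(get_original_cwd(), "data/input.txt")  # hypothetical path
    print(f"reading {data_file}")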
def run_job(
    config: DictConfig,
    task_function: TaskFunction,
    job_dir_key: str,
    job_subdir_key: Optional[str],
) -> "JobReturn":
    old_cwd = os.getcwd()
    working_dir = str(OmegaConf.select(config, job_dir_key))
    if job_subdir_key is not None:
        # evaluate job_subdir_key lazily.
        # this is running on the client side in sweep and contains things such as
        # job:id which are only available there.
        subdir = str(OmegaConf.select(config, job_subdir_key))
        working_dir = os.path.join(working_dir, subdir)
    try:
        ret = JobReturn()
        ret.working_dir = working_dir
        task_cfg = copy.deepcopy(config)
        with read_write(task_cfg):
            with open_dict(task_cfg):
                del task_cfg["hydra"]
        ret.cfg = task_cfg
        ret.hydra_cfg = OmegaConf.create({"hydra": HydraConfig.get()})
        overrides = OmegaConf.to_container(config.hydra.overrides.task)
        assert isinstance(overrides, list)
        ret.overrides = overrides
        # handle output directories here
        Path(str(working_dir)).mkdir(parents=True, exist_ok=True)
        os.chdir(working_dir)

        configure_log(config.hydra.job_logging, config.hydra.verbose)

        hydra_cfg = OmegaConf.masked_copy(config, "hydra")
        assert isinstance(hydra_cfg, DictConfig)

        if config.hydra.output_subdir is not None:
            hydra_output = Path(config.hydra.output_subdir)
            _save_config(task_cfg, "config.yaml", hydra_output)
            _save_config(hydra_cfg, "hydra.yaml", hydra_output)
            _save_config(config.hydra.overrides.task, "overrides.yaml", hydra_output)

        with env_override(hydra_cfg.hydra.job.env_set):
            ret.return_value = task_function(task_cfg)
        ret.task_name = JobRuntime.instance().get("name")

        # shut down logging to ensure job log files are closed.
        # If logging is still required after run_job, the caller is responsible
        # for re-initializing it.
        logging.shutdown()

        return ret
    finally:
        os.chdir(old_cwd)
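# For reference, a minimal sketch of an env_override-style helper as used
# above (an assumption about its behavior: it temporarily applies
# hydra.job.env_set to os.environ and restores the previous values afterwards).
import os
from contextlib import contextmanager
from typing import Dict, Iterator


@contextmanager
def env_override_sketch(env: Dict[str, str]) -> Iterator[None]:
    original = {key: os.environ.get(key) for key in env}
    os.environ.update({key: str(value) for key, value in env.items()})
    try:
        yield
    finally:
        for key, value in original.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value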
def my_app(cfg: DictConfig) -> str:
    def pickle_cfg(path: Path, obj: Any) -> Any:
        with open(str(path), "wb") as file:
            pickle.dump(obj, file)

    hydra_cfg = HydraConfig.get()
    output_dir = Path(hydra_cfg.runtime.output_dir)
    pickle_cfg(output_dir / "task_cfg.pickle", cfg)
    pickle_cfg(output_dir / "hydra_cfg.pickle", hydra_cfg)
    log.info("Running my_app")
    return "hello world"
def setup_globals() -> None:
    def register(name: str, f: Any) -> None:
        try:
            OmegaConf.register_resolver(name, f)
        except AssertionError:
            # calling it again in no_workers mode will throw. safe to ignore.
            pass

    register("now", lambda pattern: strftime(pattern, localtime()))
    register(
        "hydra",
        lambda path: OmegaConf.select(cast(DictConfig, HydraConfig.get()), path),
    )
def hydra_main(cfg):
    with open_dict(cfg):
        # make hydra logging work with ddp
        # (see https://github.com/facebookresearch/hydra/issues/1126)
        cfg.job_logging_cfg = OmegaConf.to_container(
            HydraConfig.get().job_logging, resolve=True
        )

    cfg = OmegaConf.create(
        OmegaConf.to_container(cfg, resolve=False, enum_to_str=False)
    )
    OmegaConf.set_struct(cfg, True)
    logger.info(cfg)

    _, score = main(cfg)

    if cfg.is_ax:
        return score, None
    return score
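# A hedged sketch (an assumption, not from the original source) of how the
# stashed job_logging_cfg is typically consumed in spawned DDP workers, per
# the issue linked above: re-apply it with the standard library.
import logging.config

from omegaconf import DictConfig, OmegaConf


def setup_worker_logging(cfg: DictConfig) -> None:
    if "job_logging_cfg" in cfg:
        # the config was stored as a resolved container, so dictConfig accepts it
        logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_cfg))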
def main(config):
    # to load a checkpoint and perform inference, add +evaluate=<dir_of_experiment>
    if "evaluate" not in config:
        learner = get_learner(config)
        datasets = get_datasets(
            learner.data_dir, config.data.order, debug=config.debug_data
        )
        learner.training(datasets)
        learner.write_metrics()
        if config.save_checkpoint:
            learner.save_checkpoint()
    else:
        # no training, just evaluate
        logger.info(f"Evaluating {config.evaluate}..")
        experiment_path = Path(
            hydra.utils.to_absolute_path(EXPERIMENT_DIR / config.evaluate)
        )
        config_file = experiment_path / ".hydra" / "config.yaml"
        overrides = HydraConfig.get().overrides.task
        config = OmegaConf.load(config_file)
        config.wandb = False
        config.evaluate = True
        config.testing.n_samples_before_average_evaluate = 80
        config.testing.few_shot = False
        learner = get_learner(config, experiment_path=experiment_path)
        learner.load_checkpoint()
        datasets = get_datasets(
            learner.data_dir, config.data.order, debug=config.debug_data
        )

        # validation set
        logger.info("----------Validation starts here----------\n")
        learner.testing(datasets, order=datasets["order"])
        learner.write_metrics()
        validation_results = analyze_results(
            metrics_path=learner.results_dir / METRICS_FILE, use_wandb=config.wandb
        )
        # validation_results = {"validation_" + k: v for k, v in mean_results.items()}
        # pd.DataFrame.from_dict(results, orient="index").to_csv(
        #     learner.results_dir / "validation_results.csv")

        # test set
        # logger.info("----------Testing starts here----------")
        # results, mean_results = learner.testing(datasets["test"], order=datasets["order"])
        # mean_test_results = {"test_" + k: v for k, v in mean_results.items()}
        # pd.DataFrame.from_dict(results, orient="index").to_csv(
        #     learner.results_dir / "test_results.csv")

        if config.wandb:
            wandb.run.summary.update(validation_results)
            # wandb.log(validation_results)
            # wandb.log(mean_test_results)
            learner.wandb_run.finish()

    logger.info("------------------------- Run Finished -------------------------")
def main(hydra_cfg):
    hydra_cfg.device = hydra_cfg.device.lower()
    with open_dict(hydra_cfg):
        hydra_cfg.job_logging_cfg = HydraConfig.get().job_logging

    # random seed
    if hydra_cfg.random_seed is None:
        hydra_cfg.random_seed = random.randint(1, 10000)
    set_random_seed(hydra_cfg.random_seed)

    if hydra_cfg.dist.gpus < 0:
        hydra_cfg.dist.gpus = torch.cuda.device_count()
    if hydra_cfg.device == "cpu" or hydra_cfg.dist.gpus == 0:
        hydra_cfg.dist.gpus = 0
        train_loop(0, hydra_cfg)
    else:
        distributed_run(train_loop, hydra_cfg)
def _hydra_main(cfg: FairseqConfig, **kwargs) -> float:
    add_defaults(cfg)

    if cfg.common.reset_logging:
        reset_logging()  # Hydra hijacks logging, fix that
    else:
        # check if directly called or called through hydra_main
        if HydraConfig.initialized():
            with open_dict(cfg):
                # make hydra logging work with ddp
                # (see https://github.com/facebookresearch/hydra/issues/1126)
                cfg.job_logging_cfg = OmegaConf.to_container(
                    HydraConfig.get().job_logging, resolve=True
                )

    with omegaconf_no_object_check():
        cfg = OmegaConf.create(
            OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)
        )
    OmegaConf.set_struct(cfg, True)

    try:
        if cfg.common.profile:
            with torch.cuda.profiler.profile():
                with torch.autograd.profiler.emit_nvtx():
                    distributed_utils.call_main(cfg, pre_main, **kwargs)
        else:
            distributed_utils.call_main(cfg, pre_main, **kwargs)
    except BaseException as e:
        if not cfg.common.suppress_crashes:
            raise
        else:
            logger.error("Crashed! " + str(e))

    # get best val and return - useful for sweepers
    try:
        best_val = metrics.get_smoothed_value(
            "valid", cfg.checkpoint.best_checkpoint_metric
        )
    except Exception:
        best_val = None

    if best_val is None:
        best_val = float("inf")

    return best_val
def main(hydra_cfg):
    hydra_cfg.device = hydra_cfg.device.lower()
    with open_dict(hydra_cfg):
        hydra_cfg.job_logging_cfg = HydraConfig.get().job_logging

    # random seed
    if hydra_cfg.random_seed is None:
        hydra_cfg.random_seed = random.randint(1, 10000)
    set_random_seed(hydra_cfg.random_seed)

    if hydra_cfg.dist.gpus < 0:
        hydra_cfg.dist.gpus = torch.cuda.device_count()
    hydra_cfg.dist.master_port = os.environ["MASTER_PORT"]
    hydra_cfg.dist.master_addr = os.environ["MASTER_ADDR"]
    print(hydra_cfg.dist)
    if hydra_cfg.device == "cpu" or hydra_cfg.dist.gpus == 0:
        hydra_cfg.dist.gpus = 0
        train_loop(0, hydra_cfg)
    else:
        distributed_run(train_loop, hydra_cfg)
def setup_globals() -> None:
    def register(name: str, f: Any) -> None:
        try:
            OmegaConf.register_resolver(name, f)
        except AssertionError:
            # calling it again in no_workers mode will throw. safe to ignore.
            pass

    # please add documentation when you add a new resolver
    register("now", lambda pattern: strftime(pattern, localtime()))
    register(
        "hydra",
        lambda path: OmegaConf.select(cast(DictConfig, HydraConfig.get()), path),
    )
    vi = sys.version_info
    version_dict = {
        "major": f"{vi[0]}",
        "minor": f"{vi[0]}.{vi[1]}",
        "micro": f"{vi[0]}.{vi[1]}.{vi[2]}",
    }
    register("python_version", lambda level="minor": version_dict.get(level))
def train_with_hydra(cfg: DictConfig):
    # ----------
    # TRAINING REPRODUCIBILITY
    # -----------
    # set this off if you want different random initialization every time
    seed_everything(cfg.trainer.seed_number, workers=cfg.trainer.workers)

    # ----------
    # INSTANTIATE DATASET FROM CONF
    # -----------
    # current experiment path
    experimentPath = (
        HydraConfig.get()["runtime"]["cwd"]
        + "/outputs/"
        + cfg.experiment_name
        + "/exp_"
        + str(cfg.experiment_number)
    )
    createDir(experimentPath, cfg)
    dm = instantiate(cfg.dataset)
    dm.setup()

    # ----------
    # INSTANTIATE MODEL FROM HYDRA CONF
    # -----------
    model = instantiate(cfg.model.anomaly_classifier)
    # load a pretrained pytorch model
    # model.load_pretrained_pytorch_model()
    # show model on output console

    # ----------
    # INSTANTIATE TRAINER FROM HYDRA CONF
    # -----------
    trainer = instantiate(cfg.trainer.default)

    # ----------
    # LOG HYDRA CONF AS MLFLOW ARTIFACT
    # -----------
    # To use MLflow with PyTorch Lightning you need the run id, because PL uses MlflowClient
    mlFlowRunId = trainer.logger[0].run_id
    mlFlowClient = trainer.logger.experiment[0]
    mlFlowArtifactPath = HydraConfig.get()["runtime"]["cwd"] + "/conf/"
    print("HYDRA RUNTIME CWD {}".format(HydraConfig.get()["runtime"]["cwd"]))
    modelFile = (
        mlFlowArtifactPath
        + "model/"
        + HydraConfig.get()["runtime"]["choices"]["model"]
        + ".yaml"
    )
    trainerFile = (
        mlFlowArtifactPath
        + "trainer/"
        + HydraConfig.get()["runtime"]["choices"]["trainer"]
        + ".yaml"
    )
    datasetFile = (
        mlFlowArtifactPath
        + "dataset/"
        + HydraConfig.get()["runtime"]["choices"]["dataset"]
        + ".yaml"
    )

    # Save yaml files as mlflow artifacts
    mlFlowClient.log_artifact(mlFlowRunId, modelFile, "model")
    mlFlowClient.log_artifact(mlFlowRunId, trainerFile, "trainer")
    mlFlowClient.log_artifact(mlFlowRunId, datasetFile, "dataset")

    # Keep track of the model path in MLflow
    mlFlowClient.log_param(mlFlowRunId, "model_path", experimentPath)

    # Create folder for ml logs
    # Path(cfg.trainer.default.logger.tracking_uri).mkdir(
    #     parents=True, exist_ok=True)

    if cfg.general_cfg.phase == "train":
        trainer.fit(model=model, datamodule=dm)
    elif cfg.general_cfg.phase == "test":
        trainer.test(model=model, datamodule=dm)
def run(cfg: DictConfig) -> None:
    """
    Generic train loop

    :param cfg: run configuration, defined by Hydra in /conf
    """
    if cfg.train.deterministic:
        seed_everything(cfg.train.random_seed)

    if cfg.train.pl_trainer.fast_dev_run:
        hydra.utils.log.info(
            f"Debug mode <{cfg.train.pl_trainer.fast_dev_run=}>. "
            f"Forcing debugger friendly configuration!"
        )
        # Debuggers don't like GPUs nor multiprocessing
        cfg.train.pl_trainer.gpus = 0
        cfg.data.datamodule.num_workers.train = 0
        cfg.data.datamodule.num_workers.val = 0
        cfg.data.datamodule.num_workers.test = 0

    # Hydra run directory
    hydra_dir = Path(HydraConfig.get().run.dir)

    # Instantiate datamodule
    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule: pl.LightningDataModule = hydra.utils.instantiate(
        cfg.data.datamodule, cfg=cfg
    )

    # Instantiate model
    hydra.utils.log.info(f"Instantiating <{cfg.model._target_}>")
    model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg)

    # Instantiate the callbacks
    callbacks: List[Callback] = build_callbacks(cfg=cfg)

    # Logger instantiation/configuration
    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info("Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        wandb_logger = WandbLogger(
            project=wandb_config.project,
            entity=wandb_config.entity,
            tags=cfg.core.tags,
            log_model=True,
        )
        hydra.utils.log.info(f"W&B is now watching <{wandb_config.watch.log}>!")
        wandb_logger.watch(
            model, log=wandb_config.watch.log, log_freq=wandb_config.watch.log_freq
        )

    hydra.utils.log.info("Instantiating the Trainer")

    # The Lightning core, the Trainer
    trainer = pl.Trainer(
        default_root_dir=hydra_dir,
        logger=wandb_logger,
        callbacks=callbacks,
        deterministic=cfg.train.deterministic,
        val_check_interval=cfg.logging.val_check_interval,
        progress_bar_refresh_rate=cfg.logging.progress_bar_refresh_rate,
        **cfg.train.pl_trainer,
    )

    hydra.utils.log.info("Starting training!")
    trainer.fit(model=model, datamodule=datamodule)

    hydra.utils.log.info("Starting testing!")
    trainer.test(model=model, datamodule=datamodule)

    # Logger closing to release resources/avoid multi-run conflicts
    if wandb_logger is not None:
        shutil.copytree(".hydra", Path(wandb_logger.experiment.dir) / "hydra")
        wandb_logger.experiment.finish()
def simulate(self, input_mappings):
    """Simulate multiple mappings.

    Args:
        input_mappings: input mappings

    Returns:
        list of objects of the class `SimulationResult`. The length of the
        list equals the length of `input_mappings`.
    """
    # check inputs
    if len(input_mappings) == 0:
        log.warning("Trying to simulate an empty mapping list")
        return []

    time = process_time()
    if isinstance(input_mappings[0], Mapping):
        tup = [
            tuple(self.representation.toRepresentation(m)) for m in input_mappings
        ]
        mappings = input_mappings
    else:
        # assume mappings are list type, then transform them into tuples
        tup = [
            tuple(self.representation.approximate(np.array(m)))
            for m in input_mappings
        ]
        mappings = [self.representation.fromRepresentation(m) for m in tup]
    self.statistics.add_rep_time(process_time() - time)

    # first look up as many as possible:
    lookups = [self.lookup(t) for t in tup]
    num = len([m for m in lookups if m])
    log.info(f"{num} from cache.")
    self.statistics.mappings_cached(num)

    # if all were already cached, return them
    if num == len(tup):
        return lookups

    # create a list of simulations to be run.
    # each element is a tuple (simulation, hydra_configuration)
    simulations = []

    # Logging is not configured in the spawned processes on macOS.
    # As a workaround, suggested in
    # https://github.com/facebookresearch/hydra/issues/1005
    # we pass the hydra configuration to the child processes
    cfg_pickled = None
    if HydraConfig.initialized():
        config = HydraConfig.get()
        cfg_pickled = cloudpickle.dumps(config)

    for i, mapping in enumerate(mappings):
        # skip if this particular mapping is in the cache
        if lookups[i]:
            continue
        simulation = DataflowSimulation(
            self.platform, self.graph, mapping, self.trace
        )
        simulations.append((simulation, cfg_pickled))

    if self.parallel and len(simulations) > self.chunk_size:
        # since mappings are simulated in parallel, whole simulation time
        # is added later as offset
        for _ in simulations:
            self.statistics.mapping_evaluated(0)

        # run the simulations in parallel
        with mp.Pool(processes=self.jobs) as pool:
            to_simulate = pool.imap(
                run_simulation_logger_wrapper,
                simulations,
                chunksize=self.chunk_size,
            )
            if self.progress:
                import tqdm

                to_simulate = tqdm.tqdm(to_simulate, total=len(mappings))
            simulated = list(to_simulate)
            time = sum([s[1] for s in simulated])
            simulated = [s[0] for s in simulated]
            self.statistics.add_offset(time)
    else:
        simulated = []
        # run the simulations sequentially
        for s in simulations:
            s, time = run_simulation(s[0])
            simulated.append(s)
            self.statistics.mapping_evaluated(time)

    # Collect the simulation results and store them
    sim_results = []
    sim_iter = iter(simulated)
    for i, mapping in enumerate(mappings):
        sim_res = lookups[i]
        if sim_res:
            sim_results.append(sim_res)
        else:
            s = next(sim_iter)
            sim_results.append(s.result)
            self.add_mapping_result(tup[i], s.result)
    return sim_results
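# A hedged sketch (an assumption, not from the original source) of the
# worker-side run_simulation_logger_wrapper referenced above: the spawned
# process unpickles the HydraConfig and re-applies its logging configuration,
# working around https://github.com/facebookresearch/hydra/issues/1005.
# run_simulation is assumed to come from the same module as simulate().
import cloudpickle
from hydra.core.utils import configure_log


def run_simulation_logger_wrapper_sketch(args):
    simulation, cfg_pickled = args
    if cfg_pickled is not None:
        hydra_cfg = cloudpickle.loads(cfg_pickled)
        # restore the parent's job logging setup in the child process
        configure_log(hydra_cfg.job_logging, hydra_cfg.verbose)
    return run_simulation(simulation)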
def my_app(_: DictConfig) -> None:
    run_dir = str(Path.cwd().relative_to(get_original_cwd()))
    time.sleep(2)
    run_dir_after_sleep = str(Path(HydraConfig.get().run.dir))
    assert run_dir == run_dir_after_sleep
def __call__(self, cfg: DictConfig) -> None:
    print(self._state)
    print(HydraConfig.get().job.name)
def run(cfg: DictConfig) -> None:
    """
    Generic train loop

    :param cfg: run configuration, defined by Hydra in /conf
    """
    if cfg.train.deterministic:
        seed_everything(cfg.train.random_seed)

    if cfg.train.pl_trainer.fast_dev_run:
        hydra.utils.log.info(
            f"Debug mode <fast_dev_run={cfg.train.pl_trainer.fast_dev_run}>. "
            f"Forcing debugger friendly configuration!"
        )
        # Debuggers don't like GPUs nor multiprocessing
        cfg.train.pl_trainer.gpus = 0
        cfg.data.datamodule.num_workers.train = 0
        cfg.data.datamodule.num_workers.val = 0
        cfg.data.datamodule.num_workers.test = 0

        # Switch wandb mode to offline to prevent online logging
        cfg.logging.wandb.mode = "offline"

    # Hydra run directory
    hydra_dir = Path(HydraConfig.get().run.dir)

    # Instantiate datamodule
    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule: pl.LightningDataModule = hydra.utils.instantiate(
        cfg.data.datamodule, _recursive_=False
    )

    # Instantiate model
    hydra.utils.log.info(f"Instantiating <{cfg.model._target_}>")
    model: pl.LightningModule = hydra.utils.instantiate(
        cfg.model,
        physics=cfg.physics,
        optim=cfg.optim,
        data=cfg.data,
        logging=cfg.logging,
        _recursive_=False,
    )

    # Instantiate the callbacks
    callbacks: List[Callback] = build_callbacks(cfg=cfg)

    # Logger instantiation/configuration
    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info("Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        wandb_logger = WandbLogger(
            **wandb_config,
            tags=cfg.core.tags,
        )
        hydra.utils.log.info(f"W&B is now watching <{cfg.logging.wandb_watch.log}>!")
        wandb_logger.watch(
            model,
            log=cfg.logging.wandb_watch.log,
            log_freq=cfg.logging.wandb_watch.log_freq,
        )

        # Store the YAML config separately in the wandb dir
        yaml_conf: str = OmegaConf.to_yaml(cfg=cfg)
        (Path(wandb_logger.experiment.dir) / "hparams.yaml").write_text(yaml_conf)

    hydra.utils.log.info("Instantiating the Trainer")

    # The Lightning core, the Trainer
    trainer = pl.Trainer(
        default_root_dir=hydra_dir,
        logger=wandb_logger,
        callbacks=callbacks,
        deterministic=cfg.train.deterministic,
        val_check_interval=cfg.logging.val_check_interval,
        progress_bar_refresh_rate=cfg.logging.progress_bar_refresh_rate,
        **cfg.train.pl_trainer,
    )
    log_hyperparameters(trainer=trainer, model=model, cfg=cfg)

    hydra.utils.log.info("Starting training!")
    trainer.fit(model=model, datamodule=datamodule)

    hydra.utils.log.info("Starting testing!")
    trainer.test(model=model, datamodule=datamodule)

    # Logger closing to release resources/avoid multi-run conflicts
    if wandb_logger is not None:
        wandb_logger.experiment.finish()
def run(cfg: DictConfig) -> None:
    """
    Generic train loop

    :param cfg: run configuration, defined by Hydra in /conf
    """
    if cfg.train.deterministic:
        seed_everything(cfg.train.random_seed)

    # Hydra run directory
    hydra_dir = Path(HydraConfig.get().run.dir)

    # Instantiate datamodule
    hydra.utils.log.info(f"Instantiating <{cfg.data.datamodule._target_}>")
    datamodule: pl.LightningDataModule = hydra.utils.instantiate(
        cfg.data.datamodule, cfg=cfg
    )

    # Instantiate model
    hydra.utils.log.info(f"Instantiating <{cfg.model._target_}>")
    model: pl.LightningModule = hydra.utils.instantiate(cfg.model, cfg=cfg)

    callbacks = []
    if "lr_monitor" in cfg.logging:
        hydra.utils.log.info("Adding callback <LearningRateMonitor>")
        callbacks.append(
            LearningRateMonitor(
                logging_interval=cfg.logging.lr_monitor.logging_interval,
                log_momentum=cfg.logging.lr_monitor.log_momentum,
            )
        )
    if "early_stopping" in cfg.train:
        hydra.utils.log.info("Adding callback <EarlyStopping>")
        callbacks.append(
            EarlyStopping(
                monitor=cfg.train.monitor_metric,
                mode=cfg.train.monitor_metric_mode,
                patience=cfg.train.early_stopping.patience,
                verbose=cfg.train.early_stopping.verbose,
            )
        )
    if "model_checkpoints" in cfg.train:
        hydra.utils.log.info("Adding callback <ModelCheckpoint>")
        callbacks.append(
            ModelCheckpoint(
                monitor=cfg.train.monitor_metric,
                mode=cfg.train.monitor_metric_mode,
                save_top_k=cfg.train.model_checkpoints.save_top_k,
                verbose=cfg.train.model_checkpoints.verbose,
            )
        )

    wandb_logger = None
    if "wandb" in cfg.logging:
        hydra.utils.log.info("Instantiating <WandbLogger>")
        wandb_config = cfg.logging.wandb
        wandb_logger = WandbLogger(
            project=wandb_config.project,
            entity=wandb_config.entity,
            tags=cfg.core.tags,
            log_model=True,
        )
        hydra.utils.log.info(f"W&B is now watching <{wandb_config.watch.log}>!")
        wandb_logger.watch(
            model, log=wandb_config.watch.log, log_freq=wandb_config.watch.log_freq
        )

    hydra.utils.log.info("Instantiating the Trainer")
    trainer = pl.Trainer(
        fast_dev_run=True,
        default_root_dir=hydra_dir,
        logger=wandb_logger,
        callbacks=callbacks,
        deterministic=cfg.train.deterministic,
        val_check_interval=cfg.logging.val_check_interval,
        progress_bar_refresh_rate=cfg.logging.progress_bar_refresh_rate,
        **cfg.train.pl_trainer,
    )

    hydra.utils.log.info("Starting training!")
    trainer.fit(model=model, datamodule=datamodule)

    hydra.utils.log.info("Starting testing!")
    trainer.test(model=model, datamodule=datamodule)

    if wandb_logger is not None:
        shutil.copytree(".hydra", Path(wandb_logger.experiment.dir) / "hydra")
def set_job_name(cfg, choices_list):
    hydra_cfg = HydraConfig.get()
    # e.g. choices_list=["model", "dataset"] with the defaults model=resnet50
    # and dataset=imagenet yields the job name "resnet50_imagenet"
    cfg.job.name = '_'.join(
        hydra_cfg.choices[group]
        for group in choices_list
        if group in hydra_cfg.choices
    )
def main(_: DictConfig) -> None:
    subdir = Path(HydraConfig.get().run.dir) / "subdir"
    subdir.mkdir(exist_ok=True, parents=True)
    os.chdir(subdir)
def my_app(_: DictConfig) -> None:
    # prints RunMode.RUN for a single run and RunMode.MULTIRUN under --multirun
    print(HydraConfig.get().mode)
def experiment(_cfg: DictConfig) -> None:
    print(HydraConfig.get().job.name)
def train_with_hydra(cfg: DictConfig):
    # ----------
    # TRAINING REPRODUCIBILITY
    # -----------
    # print applied augmentations
    # print(instantiate(cfg.dataset.train_transform, _convert_="all").transform.tfms_list)
    # set this off if you want different random initialization every time
    seed_everything(42, workers=cfg.trainer.workers)

    # Dataclass for custom image transforms; see the dataset configuration in .yaml
    @dataclass
    class ObjectDetectionInputTransform(InputTransform):
        # transforms applied to training data
        def train_per_sample_transform(self):
            return instantiate(cfg.dataset.train_transform, _convert_="all")

        # transforms applied to validation data
        def val_per_sample_transform(self):
            return instantiate(cfg.dataset.val_transform, _convert_="all")

    # ----------
    # INSTANTIATE DATASET FROM HYDRA CONF
    # -----------
    # train_trans = instantiate(cfg.dataset.train_transform, _convert_="all")
    # print(train_trans.transform.tfms_list)
    # input("TEST")
    dm = ObjectDetectionData.from_coco(
        train_folder=cfg.dataset.path_folder_images,
        train_ann_file=cfg.dataset.path_folder_annotations,
        val_split=cfg.dataset.val_split,
        train_transform=ObjectDetectionInputTransform,
        val_transform=ObjectDetectionInputTransform,
        batch_size=cfg.dataset.batch_size,
    )

    # ----------
    # INSTANTIATE MODEL FROM HYDRA CONF
    # -----------
    # set the number of output classes at runtime
    cfg.model.model.num_classes = dm.num_classes
    model = instantiate(cfg.model.model)

    # ----------
    # INSTANTIATE TRAINER FROM HYDRA CONF
    # -----------
    trainer = instantiate(cfg.trainer.default)

    # ----------
    # LOG HYDRA CONF AS MLFLOW ARTIFACT
    # -----------
    # To use MLflow with PyTorch Lightning you need the run id, because PL uses MlflowClient
    mlFlowRunId = trainer.logger[0].run_id
    mlFlowClient = trainer.logger.experiment[0]
    mlFlowArtifactPath = HydraConfig.get()["runtime"]["cwd"] + "/conf/"
    print("HYDRA RUNTIME CWD {}".format(HydraConfig.get()["runtime"]["cwd"]))
    modelFile = (
        mlFlowArtifactPath
        + "model/"
        + HydraConfig.get()["runtime"]["choices"]["model"]
        + ".yaml"
    )
    trainerFile = (
        mlFlowArtifactPath
        + "trainer/"
        + HydraConfig.get()["runtime"]["choices"]["trainer"]
        + ".yaml"
    )
    datasetFile = (
        mlFlowArtifactPath
        + "dataset/"
        + HydraConfig.get()["runtime"]["choices"]["dataset"]
        + ".yaml"
    )

    # Save yaml files as mlflow artifacts
    mlFlowClient.log_artifact(mlFlowRunId, modelFile, "model")
    mlFlowClient.log_artifact(mlFlowRunId, trainerFile, "trainer")
    mlFlowClient.log_artifact(mlFlowRunId, datasetFile, "dataset")

    # Keep track of the model path
    mlFlowClient.log_param(
        mlFlowRunId,
        "model_path",
        HydraConfig.get()["runtime"]["cwd"]
        + "/"
        + cfg.experiment_name
        + "/exp_"
        + str(cfg.experiment_number),
    )

    # Path to saved models
    if not os.path.exists(trainer.checkpoint_callback.dirpath):
        os.makedirs(trainer.checkpoint_callback.dirpath)

    # ----------
    # START TRAINING
    # -----------
    if cfg.model.from_scratch:
        print("FROM SCRATCH")
        trainer.fit(model=model, datamodule=dm)
    else:
        if cfg.trainer.ckpt_path is not None:
            print("RESUME TRAINING")
            model = model.load_from_checkpoint(cfg.trainer.ckpt_path)
        trainer.finetune(model=model, datamodule=dm, strategy=cfg.trainer.strategy)
def reconstruct_cmd() -> str:
    """Reconstruct the python command that was used to start this program."""
    internal_config = HydraConfig.get()
    program = internal_config.job.name + ".py"
    args = internal_config.overrides.task
    return _join([program] + OmegaConf.to_container(args))  # type: ignore[operator]
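# The _join helper is referenced above but not shown; here is a minimal
# stand-in (an assumption, shlex-based quoting) plus an illustrative result.
import shlex
from typing import List


def _join_sketch(parts: List[str]) -> str:
    # quote each argument only where shell metacharacters require it
    return " ".join(shlex.quote(p) for p in parts)


# Hypothetical example: for a run started as `python train.py lr=0.1 model=cnn`,
# reconstruct_cmd() would return "train.py lr=0.1 model=cnn".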