covar_module = covar_module_dict[key]

# Put them all together
full_train_i = torch.cat([
    torch.full_like(torch.tensor(data[d]["X"]), dtype=torch.long, fill_value=i)
    for i, d in enumerate(data)
])
full_train_x = torch.cat(
    [torch.tensor(data[d]["X"]).to(torch.float) for d in data])
full_train_y = torch.cat(
    [torch.tensor(data[d]["Y"]).to(torch.float) for d in data])

batch_size = 32
training_data = DataModule(full_train_x, full_train_y, full_train_i)
logger = TensorBoardLogger("tb_logs", name="ocean_forcing")

num_tasks = len(data)
model = PLMultitaskGPModel(
    full_train_x,
    full_train_y,
    full_train_i,
    num_tasks,
    covar_module,
)
lr_monitor = LearningRateMonitor(logging_interval="step")
early_stop_callback = EarlyStopping(
    monitor="loss",
    min_delta=0.00,
    patience=100,
    verbose=False,
    mode="min",
)
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from imipnet.lightning_module import IMIPLightning

parser = ArgumentParser()
parser = IMIPLightning.add_model_specific_args(parser)
args = parser.parse_args([
    "--loss", "outlier-balanced-bce-bce-uml",
    "--n_top_patches", "16",
    "--eval_set", "kitti-gray-0.5",
    "--preprocess", "center",
])

imip_module = IMIPLightning(args)
name = imip_module.get_new_run_name()

logger = TensorBoardLogger("./runs", name)

checkpoint_dir = os.path.join(".", "checkpoints", "simple-conv", name)
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir,
    save_last=True,
    verbose=True,
    monitor="eval_true_inliers",
    mode='max',
    period=0,  # don't wait for a new epoch to save a better model
)

overfit_val = args.overfit_n
trainer = Trainer(logger=logger)  # remaining Trainer arguments truncated in the source
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss_weighed',
    save_top_k=3,
    mode='min',
)

train_set = torch.load("materials/JVASP/Train_raman_set_25_uneq_yolov1.pt")
validate_set = torch.load(
    "materials/JVASP/Valid_raman_set_25_uneq_yolov1.pt")
train_dataloader = DataLoader(
    dataset=train_set, batch_size=batch_size, num_workers=4, shuffle=True)
validate_dataloader = DataLoader(
    dataset=validate_set, batch_size=batch_size, num_workers=4)

experiment = Experiment(**model_hpparams)
logger = TensorBoardLogger(prefix)

if trainer_config == "tune":
    trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                         logger=logger,
                         callbacks=[checkpoint_callback],
                         auto_lr_find=True)
    trainer.tune(experiment, train_dataloader, validate_dataloader)
else:
    if checkpoint_path is not None:
        trainer = pl.Trainer(resume_from_checkpoint=checkpoint_path,
                             gpus=1 if torch.cuda.is_available() else 0,
                             logger=logger,
                             callbacks=[checkpoint_callback],
                             max_epochs=epochs)
    else:
        trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                             logger=logger,
                             callbacks=[checkpoint_callback],
                             max_epochs=epochs)
    trainer.fit(experiment, train_dataloader, validate_dataloader)
# set validation data loader
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.test_batch_size,
                                          sampler=test_sampler,
                                          **kwargs)

epochs = args.epochs
with tempfile.TemporaryDirectory() as run_output_dir:
    ckpt_path = os.path.join(run_output_dir, "checkpoint")
    os.makedirs(ckpt_path, exist_ok=True)
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)

    logs_path = os.path.join(run_output_dir, "logger")
    os.makedirs(logs_path, exist_ok=True)
    logger = TensorBoardLogger(logs_path)

    train_percent = 1.0
    val_percent = 1.0

    model = Net()
    setattr(model, 'train_dataloader', lambda: train_loader)
    setattr(model, 'val_dataloader', lambda: test_loader)

    trainer = Trainer(accelerator='horovod',
                      gpus=0,
                      callbacks=None,
                      max_epochs=epochs,
                      limit_train_batches=train_percent,
                      limit_val_batches=val_percent,
                      logger=logger)  # remaining Trainer arguments truncated in the source
def get_default_logger(save_dir, version=None):
    # set up logger object without actually saving logs
    logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version)
    return logger
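
# Example use of get_default_logger (a minimal sketch; the save directory and
# Trainer arguments below are illustrative assumptions, not from the source):
from pytorch_lightning import Trainer

example_logger = get_default_logger("/tmp/lightning_logs", version=0)
example_trainer = Trainer(logger=example_logger, max_epochs=1)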
def test_tensorboard_no_name(tmpdir, name):
    """Verify that None or empty name works."""
    logger = TensorBoardLogger(save_dir=tmpdir, name=name)
    logger.log_hyperparams({"a": 1, "b": 2, 123: 3, 3.5: 4, 5j: 5})  # Force data to be written
    assert logger.root_dir == tmpdir
    assert os.listdir(tmpdir / "version_0")
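
# The `name` argument presumably arrives via pytest parametrization; an assumed
# decorator (not shown in the source) would look like:
#
#   @pytest.mark.parametrize("name", ["", None])
#   def test_tensorboard_no_name(tmpdir, name):
#       ...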
def test_tensorboard_finalize(summary_writer, tmpdir):
    """Test that the SummaryWriter closes in finalize."""
    logger = TensorBoardLogger(save_dir=tmpdir)
    logger.finalize("any")
    summary_writer().flush.assert_called()
    summary_writer().close.assert_called()
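
# The `summary_writer` fixture used above is not shown in the source. A
# plausible sketch, assuming it patches the SummaryWriter that
# TensorBoardLogger instantiates so flush/close calls can be asserted:
import pytest
from unittest import mock

@pytest.fixture
def summary_writer():
    with mock.patch("pytorch_lightning.loggers.tensorboard.SummaryWriter") as mocked:
        yield mocked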
parser.add_argument(
    "--pod_template_spec",
    type=str,
    default=None,
    help="Pod template spec",
)
parser = pl.Trainer.add_argparse_args(parent_parser=parser)
args = vars(parser.parse_args())

# Enable the TensorBoard logger, ModelCheckpoint, and EarlyStopping
lr_logger = LearningRateMonitor()
tboard = TensorBoardLogger(args["tensorboard_root"])
early_stopping = EarlyStopping(monitor="val_loss",
                               mode="min",
                               patience=5,
                               verbose=True)
checkpoint_callback = ModelCheckpoint(
    dirpath=args["checkpoint_dir"],
    filename="cifar10_{epoch:02d}",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

if not args["max_epochs"]:
    args["max_epochs"] = 1
def train(serialized_model):
    import horovod.torch as hvd

    # Horovod: initialize library.
    hvd.init()

    if verbose:
        import horovod as _horovod
        print(
            f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}"
        )

    _checkpoint_callback = None
    require_checkpoint = False

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)
        os.makedirs(logs_path, exist_ok=True)
        print(f"Made directory {logs_path} for horovod rank {hvd.rank()}")
        ckpt_dir = run_output_dir
        ckpt_filename = remote_store.checkpoint_filename

        if logger is None:
            # Use the default logger if no logger is supplied.
            train_logger = TensorBoardLogger(logs_path)
            print(f"Setup logger: Using TensorBoardLogger: {train_logger}")
        elif isinstance(logger, CometLogger) and logger._experiment_key is None:
            # Resume the logger experiment key if passed correctly from CPU.
            train_logger = CometLogger(
                save_dir=logs_path,
                api_key=logger.api_key,
                experiment_key=logger_experiment_key,
            )
            print(f"Setup logger: Resume comet logger: {vars(train_logger)}")
        else:
            # Use the logger passed in.
            train_logger = logger
            train_logger.save_dir = logs_path
            print(f"Setup logger: Using logger passed from estimator: {train_logger}")

        # Lightning requires adding checkpoint callbacks on all ranks.
        # Otherwise we are seeing hanging in training.
        for cb in callbacks:
            if isinstance(cb, ModelCheckpoint):
                cb.dirpath = ckpt_dir
                cb.filename = ckpt_filename
                _checkpoint_callback = cb
                require_checkpoint = True
                break
        if not _checkpoint_callback:
            # By default 'monitor'=None, which saves a checkpoint only for the last epoch.
            _checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir,
                                                   filename=ckpt_filename,
                                                   verbose=True)
            callbacks.append(_checkpoint_callback)

        if remote_store.saving_runs and hvd.rank() == 0:
            # Horovod: sync checkpoint and logging files only on rank 0 to
            # prevent other ranks from corrupting them.
            class _SyncCallback(Callback):
                def on_epoch_end(self, trainer: "pl.Trainer",
                                 pl_module: "pl.LightningModule") -> None:
                    remote_store.sync(run_output_dir)

            callbacks.append(_SyncCallback())

        model = deserialize(serialized_model)

        _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
            int(math.floor(float(train_rows) / batch_size / hvd.size()))
        _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
            int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

        if verbose:
            print(
                f"Training data of rank[{hvd.local_rank()}]: Epochs: {epochs}\n"
                f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {_train_steps_per_epoch}\n"
                f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n"
                f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n"
            )

        cuda_available = torch.cuda.is_available()
        # We need to check that all ranks have the same device type for training.
        # Horovod doesn't support heterogeneous allreduce for gradients.
        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
        if cuda_avail_list.count(cuda_available) != hvd.size():
            raise RuntimeError("All ranks don't have the same device type!")

        if cuda_available:
            # Horovod: pin GPU to local rank or the assigned GPU from spark.
            torch.cuda.set_device(
                _get_assigned_gpu_or_default(default=hvd.local_rank()))
            # Move model to GPU.
            model.cuda()

        _num_gpus = num_gpus
        if _num_gpus is None:
            _num_gpus = 1 if cuda_available else 0

        # Set bar refresh to once per epoch; detailed loss and metrics are
        # available in the logger, no need to print on screen here. Users can
        # still override this in trainer_args.
        progress_bar_refresh_rate = _train_steps_per_epoch

        kwargs = {
            'accelerator': 'horovod',
            'gpus': _num_gpus,
            'callbacks': callbacks,
            'max_epochs': epochs,
            'logger': train_logger,
            'log_every_n_steps': log_every_n_steps,
            'num_sanity_val_steps': 0,
            'reload_dataloaders_every_epoch': False,
            'progress_bar_refresh_rate': progress_bar_refresh_rate,
            'terminate_on_nan': terminate_on_nan,
            'profiler': profiler,
        }
        if trainer_args:
            kwargs.update(trainer_args)

        if verbose and hvd.rank() == 0:
            print("Creating trainer with: \n ", kwargs)

        trainer = Trainer(**kwargs)

        if profiler != 'simple' and trainer.profiler:
            print(f"Set profiler's logs_path for {hvd.rank()} to {logs_path}")
            trainer.profiler.dirpath = logs_path
            # Filename where the profiler results will be saved instead of
            # printing to stdout. The .txt extension will be used automatically.
            trainer.profiler.filename = "profile"

        if verbose and hvd.rank() == 0:
            print(f"pytorch_lightning version={pl.__version__}")

        data_module_kwargs = {
            'train_dir': remote_store.train_data_path,
            'val_dir': remote_store.val_data_path,
            'num_train_epochs': epochs,
            'has_val': should_validate is not None,
            'train_batch_size': batch_size,
            'val_batch_size': val_batch_size,
            'shuffle_size': calculate_shuffle_buffer_size(),
            'num_reader_epochs': loader_num_epochs,
            'reader_pool_type': reader_pool_type,
            'reader_worker_count': train_reader_worker_count,
            'transform_spec': transformation,
            'inmemory_cache_all': inmemory_cache_all,
            'cur_shard': hvd.rank(),
            'shard_count': hvd.size(),
            'schema_fields': schema_fields,
            'storage_options': storage_options,
            'steps_per_epoch_train': _train_steps_per_epoch,
            'steps_per_epoch_val': _val_steps_per_epoch,
            'verbose': verbose,
            'debug_data_loader': debug_data_loader,
            'train_async_data_loader_queue_size': train_async_data_loader_queue_size,
            'val_async_data_loader_queue_size': val_async_data_loader_queue_size,
        }
        if debug_data_loader and hvd.rank() == 0:
            print(f"Creating data module with args:\n {data_module_kwargs}")

        dataset = data_module(**data_module_kwargs)

        trainer.fit(model, dataset)

        if hvd.rank() == 0:
            if remote_store.saving_runs and trainer.profiler:
                # One more file sync to push the profiler result.
                remote_store.sync(logs_path)

            # Rank 0 overwrites the model with the best checkpoint and returns.
            if require_checkpoint:
                if verbose:
                    print("load from checkpoint best model path:",
                          _checkpoint_callback.best_model_path)
                best_model = model.load_from_checkpoint(
                    _checkpoint_callback.best_model_path)
            else:
                best_model = model

            serialized_checkpoint = io.BytesIO()
            module = best_model if not is_legacy else best_model._model
            output = {
                'model': module.state_dict(),
                'logged_metrics': trainer.logged_metrics,
            }
            torch.save(output, serialized_checkpoint)
            return serialized_checkpoint
                         transform=transformer['val'])
valloader = DataLoader(valdataset,
                       batch_size=2,
                       pin_memory=True,
                       num_workers=1)

device = 'cpu'

if __name__ == '__main__':
    mode = 'training'
    if mode == 'training':
        log_name = 'tiny_imagenet_logs/{}'.format(mode)
        logger = TensorBoardLogger(
            save_dir=os.getcwd(),
            name=log_name,
            # log_graph=True,
            # version=0
        )
        loss_callback = ModelCheckpoint(
            monitor='val_loss',
            dirpath='',
            filename='checkpoint-{epoch:02d}-{val_loss:.4f}',
            save_top_k=-1,
            mode='min',
        )
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        callbacks = [loss_callback, lr_monitor]

        model = Model(backbone=backbone.MobileNetV2(num_classes=10),
                      num_classes=10)
        # state_dict = model_zoo.load_url(
        #     model_urls['mobilenet_v2'], progress=True)
num_classes = 3
img_size = 64
dm = ImDataModule(
    batch_size=batch_size,
    num_classes=num_classes,
    img_size=img_size,
    data_dir="/media/hdd/Datasets/DenseHaze/",
)
class_ids = dm.setup()

# Logs
model = LitModel(num_classes)
logger = TensorBoardLogger(save_dir="logs")
trainer = pl.Trainer(
    auto_select_gpus=True,
    gpus=1,
    precision=16,
    profiler=False,
    max_epochs=100,
    callbacks=[pl.callbacks.ProgressBar()],
    enable_pl_optimizer=True,
    logger=logger,
    accumulate_grad_batches=16,
    accelerator="ddp",
    plugins="ddp_sharded",
)
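
# A likely next step, not shown in the source: launch training with the data
# module prepared above (assumes ImDataModule follows the standard
# LightningDataModule interface).
trainer.fit(model, datamodule=dm)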
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer

    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        eval_only: True if run evaluation only.
    """
    assert (num_processes == 1 or num_gpus == 0), \
        "Only set num_processes > 1 when training on CPUs"
    maybe_override_output_dir(cfg, output_dir)

    task = task_cls.from_config(cfg, eval_only)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)
    trainer_params = {
        # Training loop is bounded by max steps; use a large max_epochs to
        # make sure max_steps is met first.
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0 else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
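
# Example invocation (a sketch; the config loading step is an illustrative
# assumption, not from the source):
#
#   cfg = ...  # a D2Go CfgNode prepared by the caller
#   train_output = main(cfg, output_dir="./output", num_gpus=1)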
if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    parser = pl.Trainer.add_argparse_args(ArgumentParser())
    parser.add_argument('--batch_size', default=512, type=int)
    parser.add_argument('--buffer_maxlen', default=1000000, type=int)
    # parser.add_argument('--max_episode', default=2000, type=int)
    parser.add_argument('--max_episode_len', default=200, type=int)
    parser.add_argument('--warm_start_steps', default=10000, type=int)
    parser.add_argument('--actor_lr', default=1e-4, type=float)
    parser.add_argument('--critic_lr', default=1e-3, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--sync_rate', default=8, type=int)
    parser.add_argument('--loading_weights', default=True)
    cfg = parser.parse_args()

    logger = TensorBoardLogger(save_dir=os.getcwd(), name='MADDPG_logs')
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(os.getcwd(), 'saved_checkpoints/'),
        save_top_k=3,
        verbose=False,
        monitor="loss",
        save_weights_only=True,
    )
    trainer = pl.Trainer.from_argparse_args(
        cfg,
        gpus=1,
        # fast_dev_run=True,
        max_epochs=30001,
        profiler=True,
        logger=logger)
        # checkpoint_callback=checkpoint_callback)
def get_logger(logdir: Path) -> TensorBoardLogger:
    return TensorBoardLogger(str(logdir), name="unet")
def main():
    """Main entry point of the program.

    Note:
        This main.py file is meant to be called using the cli,
        see the `examples/local/run.sh` file to see how to use it.
    """
    parser = argparse.ArgumentParser()
    # __TODO__ check you need all the following CLI parameters
    parser.add_argument('--config',
                        help='config file with generic hyper-parameters, such as optimizer, '
                             'batch_size, ... - in yaml format')
    parser.add_argument('--data', help='path to data', required=True)
    parser.add_argument('--data-module', default="hdf5",
                        help="Data module to use. file or hdf5")
    parser.add_argument('--tmp-folder',
                        help='will use this folder as working folder - it will copy the input data '
                             'here, generate results here, and then copy them back to the output '
                             'folder')
    parser.add_argument('--output', help='path to outputs - will store files here', required=True)
    parser.add_argument('--disable-progressbar', action='store_true',
                        help='will disable the progressbar while going over the mini-batch')
    parser.add_argument('--start-from-scratch', action='store_true',
                        help='will not load any existing saved model - even if present')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument("--embeddings-device", type=str, default="cuda",
                        help="Which device to use for embeddings generation.")
    parser.add_argument('--embeddings', action='store_true',
                        help="Skip training and generate embeddings for evaluation.")
    parser.add_argument('--embeddings-test', action='store_true',
                        help="Skip training and generate test embeddings for evaluation.")
    parser.add_argument('--embeddings-ckpt', type=str, default=None,
                        help="Checkpoint to load when generating embeddings.")
    # parser.add_argument("--embeddings")
    parser.add_argument("--dryrun", action="store_true",
                        help="Dry-run by training on the validation set. Use only to test loop code.")

    mlflow_save_dir = "./mlruns"  # make into arg?
    tbx_save_dir = "./tensorboard"  # make into arg?

    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if args.tmp_folder is not None:
        data_folder_name = os.path.basename(os.path.normpath(args.data))
        rsync_folder(args.data, args.tmp_folder)
        data_dir = os.path.join(args.tmp_folder, data_folder_name)
        output_dir = os.path.join(args.tmp_folder, 'output')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    else:
        data_dir = args.data
        output_dir = args.output

    # to intercept any print statement:
    sys.stdout = LoggerWriter(logger.info)
    sys.stderr = LoggerWriter(logger.warning)

    assert args.config is not None
    with open(args.config, 'r') as stream:
        hyper_params = load(stream, Loader=yaml.FullLoader)

    exp_name = hyper_params["exp_name"]
    output_dir = os.path.join(output_dir, exp_name)
    os.makedirs(output_dir, exist_ok=True)
    shutil.copyfile(args.config, os.path.join(output_dir, "config.backup"))
    assert "output_dir" not in hyper_params
    hyper_params["output_dir"] = output_dir

    os.makedirs(mlflow_save_dir, exist_ok=True)
    mlf_logger = MLFlowLogger(
        experiment_name=exp_name,
        save_dir=mlflow_save_dir,
    )
    if os.path.exists(os.path.join(output_dir, STAT_FILE_NAME)):
        mlf_logger._run_id = load_mlflow(output_dir)
        logger.warning(f"WILL CONTINUE LOGGING IN MLFLOW RUN ID: {mlf_logger._run_id}")

    os.makedirs(tbx_save_dir, exist_ok=True)
    tbx_logger = TensorBoardLogger(
        save_dir=tbx_save_dir,
        name=exp_name,
        default_hp_metric=False,
    )

    log_path = os.path.join(output_dir, "console.log")
    handler = logging.handlers.WatchedFileHandler(log_path)
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)

    mlflow.set_experiment(exp_name)
    mlflow.start_run(run_id=mlf_logger.run_id)
    run(args, data_dir, output_dir, hyper_params, mlf_logger, tbx_logger)
    mlflow.end_run()

    if args.tmp_folder is not None:
        rsync_folder(output_dir + os.path.sep, args.output)
def cli_main():
    torch.multiprocessing.set_sharing_strategy('file_system')

    parser = argparse.ArgumentParser()
    parser.add_argument('--expr-name', default='autoregressive')
    parser.add_argument('--datapath', default='/data/tokenized__bert-base-cased.pkl')
    parser.add_argument('--encs-file', default='/path/to/encs.pt')
    parser.add_argument('--pretrained-retrieval-checkpoint', default='/path/to/best.ckpt')
    parser.add_argument('--default-root-dir', default='/output')
    parser.add_argument('--checkpoint-path', default=None)
    parser.add_argument('--token-limit', type=int, default=16384)
    parser.add_argument('--buffer-size', type=int, default=10000)
    parser.add_argument('--log-every', type=int, default=100)
    parser.add_argument('--max-epochs', type=int, default=50)
    parser.add_argument('--dataloader-workers', type=int, default=0)
    parser.add_argument('--model-type', default='bert-base-cased')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--mode', type=str, default='train', choices=['train'])
    parser.add_argument('--check-val-every-n-epoch', type=int, default=5)
    parser.add_argument('--decode-max-length', type=int, default=20)
    parser.add_argument('--set-mode', type=int, default=1, choices=[0, 1])
    parser.add_argument('--order', default='ground-truth', choices=['ground-truth'])
    parser.add_argument('--freeze-enc', type=int, default=0, choices=[0, 1])
    parser.add_argument('--freeze-dec-emb', type=int, default=1, choices=[0, 1])
    parser.add_argument('--freeze-dec', type=int, default=0, choices=[0, 1])
    parser.add_argument('--init-enc', type=int, default=1, choices=[0, 1])
    parser.add_argument('--init-dec-emb', type=int, default=1, choices=[0, 1])
    parser.add_argument('--init-dec', type=int, default=1, choices=[0, 1])
    parser.add_argument('--parallel', type=int, default=0, choices=[0, 1])

    # --- Trainer/lightning
    parser.add_argument('--accumulate-grad-batches', type=int, default=1)
    parser.add_argument('--gradient-clip-val', type=float, default=1.0)
    parser.add_argument('--gpus', type=int, default=1)
    parser.add_argument('--accelerator', default='ddp')
    parser.add_argument('--precision', type=int, default=16)

    parser = SequenceRetriever.add_model_specific_args(parser)
    args = parser.parse_args()
    print(args)

    pl.seed_everything(args.seed)

    tokenizer = transformers.AutoTokenizer.from_pretrained(args.model_type)

    print("Loading data (%s)" % args.datapath)
    ds_raw = pickle.load(open(args.datapath, 'rb'))
    rid2tok = ds_raw['rid2tok']
    dls = utils.get_dataloaders(ds_raw['tokenized'],
                                xpad=tokenizer.pad_token_id,
                                ypad=rid2tok['<pad>'],
                                token_limit=args.token_limit,
                                buffer_size=args.buffer_size,
                                workers=args.dataloader_workers,
                                set_mode=bool(args.set_mode),
                                order=args.order)

    if args.parallel:
        from naturalproofs.encoder_decoder.model_joint import ParallelSequenceRetriever
        model = ParallelSequenceRetriever(
            vocab_size=len(rid2tok),
            bos=rid2tok['<bos>'],
            eos=rid2tok['<eos>'],
            xpad=tokenizer.pad_token_id,
            ypad=rid2tok['<pad>'],
            lr=args.lr,
            log_every=args.log_every,
            model_type=args.model_type,
        )
    else:
        model = SequenceRetriever(vocab_size=len(rid2tok),
                                  bos=rid2tok['<bos>'],
                                  eos=rid2tok['<eos>'],
                                  xpad=tokenizer.pad_token_id,
                                  ypad=rid2tok['<pad>'],
                                  lr=args.lr,
                                  log_every=args.log_every,
                                  model_type=args.model_type,
                                  decode_max_length=args.decode_max_length)

    if args.checkpoint_path is not None:
        print("Resuming from checkpoint (%s)" % args.checkpoint_path)
        model.load_from_checkpoint(args.checkpoint_path)
    else:
        print("Initializing model using pretrained retrieval models")
        model.initialize(
            encs_file=args.encs_file,
            ckpt_file=args.pretrained_retrieval_checkpoint,
            dataset_rid2tok=rid2tok,
            init_enc=args.init_enc,
            init_dec_emb=args.init_dec_emb,
            init_dec=args.init_dec,
        )

    model.freeze_parts(
        freeze_enc=args.freeze_enc,
        freeze_dec_emb=args.freeze_dec_emb,
        freeze_dec=args.freeze_dec,
    )

    checkpoint_callback = ModelCheckpoint(
        save_last=True,
        save_top_k=1,
        monitor='val/mAP',
        mode='max',
        dirpath='%s/%s' % (args.default_root_dir, args.expr_name),
        filename='best-{val/mAP:.4f}')
    logger = TensorBoardLogger(save_dir='%s/tb_logs' % (args.default_root_dir),
                               name=args.expr_name)
    trainer = pl.Trainer(check_val_every_n_epoch=args.check_val_every_n_epoch,
                         callbacks=[checkpoint_callback],
                         default_root_dir=args.default_root_dir,
                         reload_dataloaders_every_epoch=True,
                         move_metrics_to_cpu=True,
                         gradient_clip_val=args.gradient_clip_val,
                         gpus=args.gpus,
                         accelerator=args.accelerator,
                         precision=args.precision,
                         resume_from_checkpoint=args.checkpoint_path,
                         max_epochs=args.max_epochs,
                         accumulate_grad_batches=args.accumulate_grad_batches,
                         logger=logger)

    if args.mode == 'train':
        trainer.fit(model, dls['train'], dls['valid'])
datamodule = ChangeDetectionDataModule(args.data_dir)

if args.backbone_type == 'random':
    backbone = resnet.resnet18(pretrained=False)
elif args.backbone_type == 'imagenet':
    backbone = resnet.resnet18(pretrained=True)
elif args.backbone_type == 'pretrain':
    model = MocoV2.load_from_checkpoint(args.ckpt_path)
    backbone = deepcopy(model.encoder_q)
else:
    raise ValueError()

model = SiamSegment(backbone,
                    feature_indices=(0, 4, 5, 6, 7),
                    feature_channels=(64, 64, 128, 256, 512))
model.example_input_array = (torch.zeros((1, 3, 96, 96)),
                             torch.zeros((1, 3, 96, 96)))

experiment_name = args.backbone_type
logger = TensorBoardLogger(save_dir=str(Path.cwd() / 'logs' / 'oscd'),
                           name=experiment_name)
checkpoint_callback = ModelCheckpoint(filename='{epoch}', save_weights_only=True)
trainer = Trainer(gpus=args.gpus,
                  logger=logger,
                  callbacks=[checkpoint_callback],
                  max_epochs=100,
                  weights_summary='full')
trainer.fit(model, datamodule=datamodule)
        self.logger.experiment.add_scalar(f"Loss/Val{i}", loss, self.current_epoch)
        self.logger.experiment.add_scalar(f"WER/Val{i}", wer_metric, self.current_epoch)
        self._log_wers(outputs, f"Val{i}")
        return {'loss': final_loss}


data_module = CVDataModule(batch_size=48)
model = DSModule({})
logger = TensorBoardLogger('logs', name='DeepSpeech2')
trainer = pl.Trainer(
    # fast_dev_run=True,
    logger=logger,
    gpus=(1 if torch.cuda.is_available() else 0))
trainer.fit(model, data_module)

"""
dataset_dev = CommonVoiceDataset(vocab=VocabEsp())
loader = DataLoader(dataset_dev, batch_size=2, collate_fn=PadCollate())
model = DeepSpeech2()
features, sentences, fl, sl = next(iter(loader))
def test_tensorboard_log_metrics(tmpdir, step_idx):
    logger = TensorBoardLogger(tmpdir)
    metrics = {
        "float": 0.3,
        "int": 1,
        "FloatTensor": torch.tensor(0.1),
        "IntTensor": torch.tensor(1),
    }
    logger.log_metrics(metrics, step_idx)
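
# `step_idx` presumably arrives via pytest parametrization; an assumed
# decorator (not shown in the source) would look like:
#
#   @pytest.mark.parametrize("step_idx", [10, None])
#   def test_tensorboard_log_metrics(tmpdir, step_idx):
#       ...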
def create_lightning_trainer(config: ModelConfigBase,
                             resume_from_checkpoint: Optional[Path] = None) \
        -> Tuple[Trainer, StoringLogger]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint
    handlers and loggers. That includes a diagnostic logger for use in unit tests, which is also returned
    as the second return value.
    :param config: The model configuration.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    # For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    best_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
                                               # filename=BEST_CHECKPOINT_FILE_NAME,
                                               # monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
                                               # save_top_k=1,
                                               save_last=True)
    # Recovery checkpoints: {epoch} will turn into a string like "epoch=1"
    # Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs. Due to a bug in Lightning, this
    # will still write alternate files recovery.ckpt and recovery-v0.ckpt, which are cleaned up later in
    # cleanup_checkpoint_folder
    recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
                                                   filename=RECOVERY_CHECKPOINT_FILE_NAME,
                                                   period=config.recovery_checkpoint_save_interval)

    num_gpus = torch.cuda.device_count() if config.use_gpu else 0
    logging.info(f"Number of available GPUs: {num_gpus}")
    if 0 <= config.max_num_gpus < num_gpus:
        num_gpus = config.max_num_gpus
        logging.info(f"Restricting the number of GPUs to {num_gpus}")
    # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
    # GPU memory). For unit tests, only "ddp_spawn" works.
    accelerator = "ddp" if num_gpus > 1 else None
    logging.info(f"Using {num_gpus} GPUs with accelerator '{accelerator}'")

    storing_logger = StoringLogger()
    tensorboard_logger = TensorBoardLogger(save_dir=str(config.logs_folder), name="Lightning", version="")
    loggers = [storing_logger, tensorboard_logger, AzureMLLogger()]
    # This leads to problems with run termination.
    # if not is_offline_run_context(RUN_CONTEXT):
    #     mlflow_logger = MLFlowLogger(experiment_name=RUN_CONTEXT.experiment.name,
    #                                  tracking_uri=RUN_CONTEXT.experiment.workspace.get_mlflow_tracking_uri())
    #     # The MLFlow logger needs to get its ID from the AzureML run context, otherwise there will be two sets of
    #     # results for each run, one from native AzureML and one from the MLFlow logger.
    #     mlflow_logger._run_id = RUN_CONTEXT.id
    #     loggers.append(mlflow_logger)

    # Use 32bit precision when running on CPU. Otherwise, make it depend on the use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if config.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # For the classification models, we observed only a small performance deterioration (an increase of 10sec on a
    # total training time of 22min) when switching to deterministic.
    if config.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True
    trainer = Trainer(default_root_dir=str(config.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      max_epochs=config.num_epochs,
                      num_sanity_val_steps=config.pl_num_sanity_val_steps,
                      callbacks=[best_checkpoint_callback, recovery_checkpoint_callback],
                      logger=loggers,
                      progress_bar_refresh_rate=0,  # Disable the progress bar
                      gpus=num_gpus,
                      precision=precision,
                      sync_batchnorm=True,
                      terminate_on_nan=config.detect_anomaly,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None)
    return trainer, storing_logger
pipeline = PipelineModule(opt)

gpus = torch.cuda.device_count()
print("Using {} GPU".format(gpus))
print("Writing results to {}".format(opt.path_out))
mkdir(opt.path_out)

if opt.logger == "wandb":
    wandb.login()
    logger = pl.loggers.WandbLogger(save_dir=opt.path_out,
                                    name=opt.experiment_name,
                                    project=opt.project)
    logger.log_hyperparams(opt)
    logger.watch(pipeline)
elif opt.logger == "tensorboard":
    logger = TensorBoardLogger(save_dir=opt.path_out, name=opt.experiment_name)

trainer = pl.Trainer(
    logger,
    gpus=gpus,
    max_epochs=opt.max_epochs,
    default_root_dir=opt.path_out,
    terminate_on_nan=False,  # terminate_on_nan is expensive
    limit_val_batches=0.25,
    callbacks=[ImageLogCallback(opt), ModelCheckpoint()],
    fast_dev_run=opt.debug,
    resume_from_checkpoint=opt.checkpoint,
    weights_summary='top')

if not opt.debug:
    ...  # body truncated in the source
def test_validation_step_log_with_tensorboard(mock_log_metrics, tmpdir):
    """
    This test makes sure we properly pass logged metrics to the loggers.
    """

    class ExtendedModel(BoringModel):

        val_losses = []

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('train_loss', loss)
            return {"loss": loss}

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.val_losses.append(loss)
            self.log('valid_loss_0', loss, on_step=True, on_epoch=True)
            self.log('valid_loss_1', loss, on_step=False, on_epoch=True)
            self.log('valid_loss_2', loss, on_step=True, on_epoch=False)
            self.log('valid_loss_3', loss, on_step=False, on_epoch=False)
            return {"val_loss": loss}

        def test_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('test_loss', loss)
            return {"y": loss}

    model = ExtendedModel()
    model.validation_epoch_end = None

    # Initialize a trainer
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=TensorBoardLogger(tmpdir),
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        max_epochs=2,
        progress_bar_refresh_rate=1,
    )

    # Train the model ⚡
    trainer.fit(model)

    # hp_metric + 2 steps + epoch + 2 steps + epoch
    expected_num_calls = 1 + 2 + 1 + 2 + 1

    assert len(mock_log_metrics.mock_calls) == expected_num_calls
    assert mock_log_metrics.mock_calls[0] == call({'hp_metric': -1}, 0)

    def get_metrics_at_idx(idx):
        mock_calls = list(mock_log_metrics.mock_calls)
        if isinstance(mock_calls[idx].kwargs, dict):
            return mock_calls[idx].kwargs["metrics"]
        else:
            return mock_calls[idx][2]["metrics"]

    expected = ['valid_loss_0_step/epoch_0', 'valid_loss_2/epoch_0', 'global_step']
    assert sorted(get_metrics_at_idx(1)) == sorted(expected)
    assert sorted(get_metrics_at_idx(2)) == sorted(expected)

    expected = model.val_losses[2]
    assert get_metrics_at_idx(1)["valid_loss_0_step/epoch_0"] == expected
    expected = model.val_losses[3]
    assert get_metrics_at_idx(2)["valid_loss_0_step/epoch_0"] == expected

    expected = ['valid_loss_0_epoch', 'valid_loss_1', 'epoch', 'global_step']
    assert sorted(get_metrics_at_idx(3)) == sorted(expected)

    expected = torch.stack(model.val_losses[2:4]).mean()
    assert get_metrics_at_idx(3)["valid_loss_1"] == expected

    expected = ['valid_loss_0_step/epoch_1', 'valid_loss_2/epoch_1', 'global_step']
    assert sorted(get_metrics_at_idx(4)) == sorted(expected)
    assert sorted(get_metrics_at_idx(5)) == sorted(expected)

    expected = model.val_losses[4]
    assert get_metrics_at_idx(4)["valid_loss_0_step/epoch_1"] == expected
    expected = model.val_losses[5]
    assert get_metrics_at_idx(5)["valid_loss_0_step/epoch_1"] == expected

    expected = ['valid_loss_0_epoch', 'valid_loss_1', 'epoch', 'global_step']
    assert sorted(get_metrics_at_idx(6)) == sorted(expected)

    expected = torch.stack(model.val_losses[4:]).mean()
    assert get_metrics_at_idx(6)["valid_loss_1"] == expected

    results = trainer.test(model)
    expected_callback_metrics = {
        'train_loss',
        'valid_loss_0_epoch',
        'valid_loss_0',
        'debug_epoch',
        'valid_loss_1',
        'test_loss',
        'val_loss',
    }
    assert set(trainer.callback_metrics) == expected_callback_metrics
    assert set(results[0]) == {'test_loss', 'debug_epoch'}
def main(args):
    utils.set_seed_everywhere(args.seed)
    cfg = hyperparameters.get_config(args)
    cfg.seed = args.seed
    cfg.base_dir = cfg.base_dir + "_s_" + str(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    time_str = datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    cfg.log_training = args.log_training
    cfg.log_training_path = os.path.join(exp_dir, args.log_training_path)
    cfg.num_steps = args.num_steps
    cfg.device = str(torch.device("cuda" if args.cuda else "cpu"))
    save_config(cfg, exp_dir, "config.json")
    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    num_timesteps = cfg.observed_steps + cfg.predicted_steps
    data_shape = {'image': (None, num_timesteps, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)
    cp_callback = ModelCheckpoint(filepath=os.path.join(checkpoint_dir, "model_"),
                                  period=25,
                                  save_top_k=-1)
    logger = TensorBoardLogger(log_dir, name="", version=None)

    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(
        max_epochs=10000,
        max_steps=args.num_steps,
        logger=logger,
        checkpoint_callback=cp_callback,
        gpus=gpus,
        # distributed_backend='dp',
        progress_bar_refresh_rate=1,
        # gradient_clip_val=cfg.clipnorm,
        fast_dev_run=False,
        # train_percent_check=0.1, val_percent_check=0.0,
        val_percent_check=0.3,
        track_grad_norm=2,
        num_sanity_val_steps=0,
        show_progress_bar=True)
    trainer.fit(model)

    save_path = os.path.join(checkpoint_dir,
                             "model_final_" + str(args.num_steps) + ".ckpt")
    print("Saving model finally:")
    trainer.save_checkpoint(save_path)
#%% Load model
import models

vae = models.VAE(input_height=dataset.dims[2], num_labels=10, lr=0.001)

#%%
import utils
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

cb_imageplot = utils.PlotImage(batch)
cb_checkpoint = ModelCheckpoint(monitor='val/loss', verbose=True, save_last=True)
trainer = pl.Trainer(
    gpus=1,
    # auto_lr_find=True,
    logger=TensorBoardLogger('lightning_logs', name='coco-withtext'),
    callbacks=[cb_imageplot],
    checkpoint_callback=cb_checkpoint)
# trainer.tune(vae, train_dataloader=dataset)

#%%
trainer.fit(vae, dataset)

# %% Analysis
# vae = models.VAE.load_from_checkpoint("lightning_logs/version_14/checkpoints/epoch=348.ckpt")

#%% Test batch
out = vae(x, y)
out['image'].min()
cfg = parse_config(args.cfg_file)
data_cfg = get_data_info(cfg['data'])
cfg['data'] = data_cfg
args.cfg = cfg

ckpt_fd = "{}".format(args.output_directory) + "/{epoch:02d}_{train_loss:.3f}_{val_acc:.3f}"
ckpt_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(
    filepath=ckpt_fd, verbose=True, save_top_k=-1)
# can't get it to work with val_mAP for now, loss should do too
es_cb = pl.callbacks.EarlyStopping("val_acc", mode="max", verbose=True, patience=10)

prog = pl.loggers.CSVLogger(args.log_directory)
tb_prog = TensorBoardLogger(args.log_directory)
loggers = [prog, tb_prog]

if args.use_mixers:
    raise DeprecationWarning(
        "running with --use_mixers is deprecated. Not doing anything")
    exit(1)
    mixer = mixers.BackgroundAddMixer()
    mixer = mixers.UseMixerWithProb(mixer, args.mixer_prob)
    args.tr_mixer = mixer  # mixers.UseMixerWithProb(mixer, args.mixer_prob)
else:
    args.tr_mixer = None

tr_tfs = get_transforms_classifier(True, args.random_clip_size)
val_tfs = get_transforms_classifier(False, args.val_clip_size, center_crop_val=True)
args.gpus = 1  # TODO: push to lightning
args.gradient_clip_val = float(args.gradient_clip_val)

groups = {}
for group in parser._action_groups:
    group_dict = {
        a.dest: getattr(args, a.dest, None)
        for a in group._group_actions
    }
    groups[group.title] = argparse.Namespace(**group_dict)

lightning_args = groups['lightning_options']
logger = TensorBoardLogger(lightning_args.default_root_dir,
                           name=f'{exp_args.experiment}/{exp_args.model}')
lightning_args.logger = logger

hparams = groups['experiment']
model_params = groups['model']
for k, v in vars(model_params).items():
    setattr(hparams, k, v)

trainer = Trainer.from_argparse_args(lightning_args)
model = model_class(**vars(model_params))
experiment = exp_class(hparams, model)

if not args.validate:
    warnings.filterwarnings(
# loss_functions = {'mse': F.mse_loss, 'bce': F.binary_cross_entropy, 'mae': F.l1_loss}
# loss_functions = {'mse': F.mse_loss, 'bce': F.binary_cross_entropy}
loss_functions = {'bce': F.binary_cross_entropy}

test_jet_indices = random.sample(range(0, len(jetsPL_test)), 50)
train_loader = jet_dataloader_train

for hs, channels, loss_k in itertools.product(hs_s, channels_s, loss_functions):
    autoencoder = lit_conv_ae_paper.LitAE(hidden_size=hs,
                                          channels=channels,
                                          loss=loss_functions[loss_k])
    print('hidden_size', hs)
    print('channels', channels)
    print('loss', loss_k)
    # print('loss val', loss_functions[loss_k])

    model_name = f'bin_{utils.N_IMAGE_BINS}_hs_{hs}_chan_{channels}_loss_{loss_k}_ss_{utils.SAMPLE_SIZE}_ne_{utils.NUM_EPOCHS}'
    logdir = f'tb_logs_{loss_k}'
    logger = TensorBoardLogger(logdir, name=model_name)
    trainer = pl.Trainer(max_epochs=utils.NUM_EPOCHS, logger=logger)
    trainer.fit(autoencoder, train_loader)

    output_jets_train = inference_jets(jetsDL_train, autoencoder)
    output_jets_test = inference_jets(jetsDL_test, autoencoder)

    # jet observables
    # pl-dl-dlue
    # scale back, move to numpy arrays
    # jetsPL_test = [unscale_hist(j.detach().numpy()) for j in jetsPL_test]
    # jetsDL_test = [unscale_hist(j.detach().numpy()) for j in jetsDL_test]
    # output_jets = [unscale_hist(j) for j in output_jets]
    # for j in jetsPL_test:
def main(cfg):
    # for subject in [25]:
    # cfg = load_cfg()
    run_pd = cfg['run_pd']
    subjects_list = cfg['pd_subjects'] if run_pd else cfg['healthy_subjects']
    if run_pd:
        med_str = '-on-med' if cfg['model_on_med'] else '-off-med'
    else:
        med_str = ''

    for subject in subjects_list:
        print('------------------------------------\nSubject', subject,
              '\n------------------------------------')
        # subject = 35, healthy
        # subject = 55, PD
        input_data, targets, long_labels = subject_nn_data(
            subject,
            healthy_subjects=cfg['healthy_subjects'],
            pd_subjects=cfg['pd_subjects'],
            feature_name=cfg['pred_feature'],
            data_path=cfg['data_path'],
            pd_dir=cfg['pd_dir'],
            healthy_dir=cfg['healthy_dir'],
            on_med=cfg['model_on_med'],
            use_silent_channels=cfg['use_silent_channels'],
            mask_value=cfg['mask_value'])

        freqs = ['alpha', 'beta', 'gamma']
        freqs_idx = [0, 1, 2]

        split_idx_path = osp.join(cfg['outputs_path'], cfg['splits_path'],
                                  f'{subject}{med_str}-mlp.npy')
        indices_shuf = []
        if cfg['subject_crossval']:
            indices = np.arange(input_data.shape[1])
            for i in [1, 2, 3, 4, 5]:
                np.random.shuffle(indices)
                indices_shuf.append(indices.copy())
        else:
            if osp.exists(split_idx_path):
                indices = np.load(split_idx_path)
            else:
                indices = np.arange(input_data.shape[1])
                np.random.shuffle(indices)
                np.save(split_idx_path, indices)
            indices_shuf.append(indices)

        split_idx = int(input_data.shape[1] * 0.9)
        for shuf, indices in enumerate(indices_shuf):
            for freq in freqs_idx:
                # train-val split
                train_data = FlatEEGDataset(
                    np_input=input_data[freq, indices[:split_idx], :],
                    np_targets=targets[indices[:split_idx]])
                val_data = FlatEEGDataset(
                    np_input=input_data[freq, indices[split_idx:], :],
                    np_targets=targets[indices[split_idx:]])

                # data loaders
                train_loader = DataLoader(train_data,
                                          batch_size=cfg['batch_size'],
                                          shuffle=True,
                                          num_workers=0)
                val_loader = DataLoader(val_data,
                                        batch_size=len(val_data),
                                        shuffle=False,
                                        num_workers=0)

                # model
                idx_hparams = {
                    'n_features': input_data.shape[2],
                    'n_states': len(np.unique(targets)),
                    'n_hidden_nodes': cfg['n_hidden_nodes'],
                    'n_hidden_layers': cfg['n_hidden_layers'],
                    'lr': cfg['lr'],
                    'epochs': cfg['epochs'],
                    'freq_name': freqs[freq],
                    'pred_feature': cfg['pred_feature'],
                    'input_dropout': cfg['input_dropout'],
                    'mlp_dropout': cfg['mlp_dropout'],
                    'weight_decay': cfg['weight_decay'],
                    'num_classes': 3,
                }
                model = LitMlpClassifier(hparams=idx_hparams)

                # training
                cv = f'split{shuf}' if cfg['subject_crossval'] else ''
                prefix = 'pow-mean' if (cfg['mat_dict'] == 'dataSorted') else 'IC-MEAN'
                hparams_str = f"bs{cfg['batch_size']}_hn{cfg['n_hidden_nodes']}_lr{cfg['lr']}"
                logger = TensorBoardLogger(
                    save_dir=osp.join(cfg['experiments_dir'], f"subject-{subject}"),
                    name=f"freq-{freqs[freq]}-single_subject",
                    version=f"MLP{cv}{med_str}-{prefix}_{hparams_str}_"
                            f"{datetime.now().strftime('%Y-%m-%d_%H%M')}")
                trainer = pl.Trainer(max_epochs=cfg['epochs'], logger=logger)
                trainer.fit(model, train_loader, val_loader)
    supervised=args.supervised,
    out_dim=train_dataset.n_classes,
)

# ------------
# model
# ------------
args.accelerator = "dp"
if args.supervised:
    module = SupervisedLearning(args, encoder, output_dim=train_dataset.n_classes)
else:
    module = ContrastiveLearning(args, encoder)

logger = TensorBoardLogger("runs", name="CLMRv2-{}".format(args.dataset))
if args.checkpoint_path:
    module = module.load_from_checkpoint(
        args.checkpoint_path,
        encoder=encoder,
        output_dim=train_dataset.n_classes)
else:
    # ------------
    # training
    # ------------
    if args.supervised:
        early_stopping = EarlyStopping(monitor='Valid/loss', patience=20)
    else:
        early_stopping = None
def get_log_dir(
    trainer: 'pytorch_lightning.Trainer',
    exp_dir: str = None,
    name: str = None,
    version: str = None,
    explicit_log_dir: str = None,
    use_datetime_version: bool = True,
) -> (Path, str, str, str):
    """
    Obtains the log_dir used for exp_manager.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raises:
        LoggerMisconfigurationError: If trainer is incompatible with arguments.
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True and more than one checkpoint was found.
    """
    if explicit_log_dir:  # If an explicit log_dir was passed, short circuit
        return check_explicit_log_dir(trainer, explicit_log_dir, exp_dir, name, version)

    # Default exp_dir to ./nemo_experiments if None was passed
    _exp_dir = exp_dir
    if exp_dir is None:
        _exp_dir = str(Path.cwd() / 'nemo_experiments')

    # If the user has already defined a logger for the trainer, use the logger defaults for the logging directory
    if trainer.logger is not None:
        if trainer.logger.save_dir:
            if exp_dir:
                raise LoggerMisconfigurationError(
                    "The pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's "
                    f"save_dir was not None, and exp_dir ({exp_dir}) was not None. If trainer.logger.save_dir "
                    "exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir "
                    "must be None.")
            _exp_dir = trainer.logger.save_dir
        if name:
            raise LoggerMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: "
                f"{name} was also passed to exp_manager. If the trainer contains a "
                "logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None.")
        name = trainer.logger.name
        version = f"version_{trainer.logger.version}"
    # Use user-defined exp_dir, project_name, exp_name, and versioning options
    else:
        name = name or "default"
        version = version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None)
        if version is None:
            if trainer.is_slurm_managing_tasks:
                logging.warning("Running on a slurm cluster. exp_manager will not add a version number.")
                version = ""
            elif is_global_rank_zero():
                if use_datetime_version:
                    version = time.strftime('%Y-%m-%d_%H-%M-%S')
                else:
                    tensorboard_logger = TensorBoardLogger(
                        save_dir=Path(_exp_dir), name=name, version=version)
                    version = f"version_{tensorboard_logger.version}"
                os.environ[NEMO_ENV_VARNAME_VERSION] = version

    log_dir = Path(_exp_dir) / Path(str(name)) / Path(str(version))
    return log_dir, str(_exp_dir), name, version
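
# Example call of get_log_dir (a sketch; the trainer and argument values are
# illustrative assumptions, not from the source):
import pytorch_lightning as pl

example_trainer = pl.Trainer(logger=False)  # no logger, so the user-defined branch is taken
log_dir, exp_dir, name, version = get_log_dir(
    example_trainer, exp_dir="./nemo_experiments", name="my_model")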