@pytest.mark.parametrize("logging_interval", ['step', 'epoch'])
def test_lr_logger_multi_lrs(tmpdir, logging_interval):
    """Test that learning rates are extracted and logged for multiple lr schedulers."""
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__multiple_schedulers

    lr_logger = LearningRateLogger(logging_interval=logging_interval)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_logger],
    )
    result = trainer.fit(model)
    assert result

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all(k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()), \
        'Names of learning rates not set correctly'

    if logging_interval == 'step':
        expected_number_logged = trainer.global_step
    if logging_interval == 'epoch':
        expected_number_logged = trainer.max_epochs
    assert all(len(lr) == expected_number_logged for lr in lr_logger.lrs.values()), \
        'Length of logged learning rates does not match the expected number'

def main(hparams):
    cifar10_download.main()

    if not th.cuda.is_available():
        hparams.cuda = False
    hparams.gpus = '0,' if hparams.cuda else None

    seed_everything(hparams.seed)

    # When training on only 1 GPU we must call set_device, otherwise PyTorch always stores the model on GPU 0 first
    if type(hparams.gpus) == str:
        if len(hparams.gpus) == 2:  # GPU number and comma, e.g. '0,' or '1,'
            torch.cuda.set_device(int(hparams.gpus[0]))

    # Model
    classifier = CIFAR10_Module(hparams)

    # Trainer
    lr_logger = LearningRateLogger()
    logger = TensorBoardLogger("logs", name=hparams.classifier)
    trainer = Trainer(callbacks=[lr_logger],
                      gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      deterministic=True,
                      early_stop_callback=False,
                      logger=logger,
                      checkpoint_callback=False,
                      fast_dev_run=hparams.debug)

    if not hparams.eval:
        trainer.fit(classifier)
    else:
        trainer.test(classifier)

    if hparams.save_model:
        model = classifier.student_models[0] if hparams.num_students else classifier.teacher_model
        th.save(model.state_dict(), 'logs/{}.pt'.format(hparams.classifier))

def test_lr_logger_multi_lrs(tmpdir):
    """Test that learning rates are extracted and logged for multiple lr schedulers."""
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__multiple_schedulers

    lr_logger = LearningRateLogger()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_logger],
    )
    result = trainer.fit(model)
    assert result

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all(k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()), \
        'Names of learning rates not set correctly'
    assert all(len(lr) == trainer.max_epochs for lr in lr_logger.lrs.values()), \
        'Length of logged learning rates does not match the number of epochs'

def train():
    config = get_transformer_encoder_config()
    model = TransformerModelEncoderLightning(config)
    gpu = 1 if torch.cuda.is_available() else None

    # define learning rate logger
    lr_logger = LearningRateLogger()
    tensorlogger = TensorBoardLogger("ts_logger", "transformer_encoder")

    # define early stopping callback
    early_stopping_callback = EarlyStopping(patience=3, verbose=True, mode="min")

    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(tensorlogger.log_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=1,
        save_top_k=3,
    )

    trainer = pl.Trainer(max_epochs=10,
                         gpus=gpu,
                         gradient_clip_val=0.5,
                         row_log_interval=200,
                         check_val_every_n_epoch=1,
                         reload_dataloaders_every_epoch=True,
                         callbacks=[lr_logger],
                         logger=tensorlogger,
                         checkpoint_callback=model_checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         progress_bar_refresh_rate=1)
    trainer.fit(model)

def main():
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    model = LNNP(args)
    checkpoint_callback = ModelCheckpoint(
        filepath=args.log_dir,
        monitor="val_loss",
        save_top_k=8,
        period=args.eval_interval,
    )
    lr_logger = LearningRateLogger()
    tb_logger = pl.loggers.TensorBoardLogger(args.log_dir)
    trainer = pl.Trainer(gpus=args.gpus,
                         max_epochs=args.num_epochs,
                         distributed_backend=args.distributed_backend,
                         num_nodes=args.num_nodes,
                         default_root_dir=args.log_dir,
                         auto_lr_find=False,
                         resume_from_checkpoint=args.load_model,
                         checkpoint_callback=checkpoint_callback,
                         callbacks=[lr_logger],
                         logger=tb_logger,
                         reload_dataloaders_every_epoch=False)
    trainer.fit(model)

    # run test set after completing the fit
    trainer.test()

def main(hparams):
    checkpoint = ModelCheckpoint(filepath='weights/{epoch}_fold=' + str(hparams.fold),
                                 save_top_k=1,
                                 monitor='val_loss',
                                 mode='min',
                                 verbose=True,
                                 prefix='')
    logger = TensorBoardLogger('pn_logs', name='fold={fold}'.format(fold=str(hparams.fold)))
    lr_logger = LearningRateLogger(logging_interval='step')
    model = Pneumothorax(hparams)
    trainer = pl.Trainer(
        max_epochs=hparams.epochs,
        gpus=[hparams.gpus],
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=2,
        logger=logger,
        callbacks=[lr_logger],
        precision=16,
    )
    trainer.fit(model)

def main(hparams):
    seed_everything(0)

    # When training on only 1 GPU we must call set_device, otherwise PyTorch always stores the model on GPU 0 first
    if type(hparams.gpus) == str:
        if len(hparams.gpus) == 2:  # GPU number and comma, e.g. '0,' or '1,'
            torch.cuda.set_device(int(hparams.gpus[0]))

    # Model
    classifier = CIFAR10_Module(hparams)

    # Trainer
    lr_logger = LearningRateLogger()
    logger = TensorBoardLogger("logs", name=hparams.classifier)
    trainer = Trainer(callbacks=[lr_logger],
                      gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      deterministic=True,
                      early_stop_callback=False,
                      logger=logger)
    trainer.fit(classifier)

    # Load best checkpoint
    checkpoint_path = os.path.join(os.getcwd(), 'logs', hparams.classifier,
                                   'version_' + str(classifier.logger.version), 'checkpoints')
    classifier = CIFAR10_Module.load_from_checkpoint(
        os.path.join(checkpoint_path, os.listdir(checkpoint_path)[0]))

    # Save weights from checkpoint
    statedict_path = os.path.join(os.getcwd(), 'cifar10_models', 'state_dicts',
                                  hparams.classifier + '.pt')
    torch.save(classifier.model.state_dict(), statedict_path)

    # Test model
    trainer.test(classifier)

def get_trainer_kwargs(args):
    version = args.version
    if args.name is not None:
        version = f'{version}-{args.name}'
    if args.resume_from_checkpoint is not None and '/' not in args.resume_from_checkpoint:
        if args.resume_from_checkpoint.endswith(args.name):
            version = f'{args.version}-{args.resume_from_checkpoint}'
        else:
            version = f'{version}-{args.resume_from_checkpoint}'
    args.version = version
    os.makedirs(Path('data') / args.lang / 'runs' / args.version, exist_ok=True)
    logger = pl.loggers.TensorBoardLogger(save_dir=str(Path('data') / args.lang),
                                          name='runs',
                                          version=args.version)
    lr_logger = LearningRateLogger()
    checkpoint_callback = ModelCheckpoint(filepath=None,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True,
                                          save_top_k=5,
                                          save_last=True,
                                          period=0)
    return {
        'logger': logger,
        'default_root_dir': 'data',
        'callbacks': [lr_logger],
        'checkpoint_callback': checkpoint_callback,
        'replace_sampler_ddp': False,
    }

def test_train_pipeline(fix_seed, config, gpus):
    config = OmegaConf.create(config)
    train_dataloader, test_dataloader = get_data_loaders(config=config)
    lr_logger = LearningRateLogger()
    model = build_model(model_conf=config.model)
    runner = Runner(model=model, config=config.runner)
    trainer = Trainer(
        distributed_backend=config.runner.trainer.distributed_backend,
        fast_dev_run=True,
        gpus=gpus,
        amp_level="O2",
        row_log_interval=10,
        callbacks=[lr_logger],
        max_epochs=1,
        weights_summary="top",
        reload_dataloaders_every_epoch=False,
        resume_from_checkpoint=None,
        benchmark=False,
        deterministic=True,
        num_sanity_val_steps=5,
        overfit_batches=0.0,
        precision=32,
        profiler=True,
    )
    trainer.fit(model=runner,
                train_dataloader=train_dataloader,
                val_dataloaders=test_dataloader)

def train(dataset_name: str, model_name: str, expt_dir: str, data_folder: str,
          num_workers: int = 0, is_test: bool = False, resume_from_checkpoint: str = None):
    seed_everything(SEED)
    dataset_main_folder = data_folder
    vocab = Vocabulary.load(join(dataset_main_folder, "vocabulary.pkl"))
    if model_name == "code2seq":
        config_function = get_code2seq_test_config if is_test else get_code2seq_default_config
        config = config_function(dataset_main_folder)
        model = Code2Seq(config, vocab, num_workers)
        model.half()
    # elif model_name == "code2class":
    #     config_function = get_code2class_test_config if is_test else get_code2class_default_config
    #     config = config_function(dataset_main_folder)
    #     model = Code2Class(config, vocab, num_workers)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project=f"{model_name}-{dataset_name}", log_model=True, offline=True)
    wandb_logger.watch(model)

    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(expt_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.hyperparams.save_every_epoch,
        save_top_k=3,
    )
    # define early stopping callback
    early_stopping_callback = EarlyStopping(patience=config.hyperparams.patience, verbose=True, mode="min")

    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None

    # define learning rate logger
    lr_logger = LearningRateLogger()
    trainer = Trainer(
        max_epochs=20,
        gradient_clip_val=config.hyperparams.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.hyperparams.val_every_epoch,
        row_log_interval=config.hyperparams.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )
    trainer.fit(model)
    trainer.save_checkpoint(join(expt_dir, 'Latest.ckpt'))
    trainer.test()

def generic_train(model: BaseTransformer,
                  args: argparse.Namespace,
                  early_stopping_callback=False,
                  logger=True,  # can pass WandbLogger() here
                  extra_callbacks=[],
                  checkpoint_callback=None,
                  logging_callback=None,
                  **extra_train_kwargs):
    pl.seed_everything(args.seed)

    # init model
    odir = Path(model.hparams.output_dir)
    odir.mkdir(exist_ok=True)

    # add custom checkpoints
    if checkpoint_callback is None:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1)
    if logging_callback is None:
        logging_callback = LoggingCallback()

    train_params = {}

    # TODO: remove with PyTorch 1.6 since pl uses native amp
    if args.fp16:
        train_params["precision"] = 16
        train_params["amp_level"] = args.fp16_opt_level

    if args.gpus > 1:
        train_params["distributed_backend"] = "ddp"

    train_params["accumulate_grad_batches"] = args.accumulate_grad_batches

    lr_logger = LearningRateLogger(logging_interval='step')

    trainer = pl.Trainer.from_argparse_args(
        args,
        weights_summary='full',
        callbacks=[logging_callback, lr_logger],
        logger=logger,
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        num_sanity_val_steps=4,
        # deterministic=True,
        **train_params,
    )
    trainer.lr_logger = lr_logger

    if args.do_train:
        trainer.fit(model)

    return trainer

def init_trainer(): """ Init a Lightning Trainer using from_argparse_args Thus every CLI command (--gpus, distributed_backend, ...) become available. """ parser = ArgumentParser() parser = Trainer.add_argparse_args(parser) args = parser.parse_args() lr_logger = LearningRateLogger() return Trainer.from_argparse_args(args, callbacks=[lr_logger])
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)
    model = LightningModel(cfg)
    checkpoint_callback = ModelCheckpoint(
        filepath=f"{cfg.checkpoint_path}/{cfg.name}/{cfg.version}/"
                 f"{cfg.name}_{cfg.version}_{{epoch}}_{{avg_val_loss:.3f}}_{{ade:.3f}}_{{fde:.3f}}_{{fiou:.3f}}",
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='fiou',
        mode='max',
        prefix='')
    lr_logger_callback = LearningRateLogger(logging_interval='step')
    logger = TensorBoardLogger(save_dir=cfg.log_path, name=cfg.name, version=cfg.version)
    logger.log_hyperparams(model.hparams)
    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    check_val_every_n_epoch = cfg.check_val_every_n_epoch if hasattr(cfg, 'check_val_every_n_epoch') else 1
    trainer = pl.Trainer(
        gpus=cfg.num_gpus,
        max_epochs=cfg.max_epochs,
        logger=logger,
        profiler=profiler,  # this line won't work in a multi-gpu setting.
        weights_summary="top",
        gradient_clip_val=cfg.gradient_clip_val,
        callbacks=[lr_logger_callback],
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        accumulate_grad_batches=cfg.batch_size_times,
        check_val_every_n_epoch=check_val_every_n_epoch)

    if (not (args.train or args.test)) or args.train:
        shutil.copy(
            args.config,
            os.path.join(cfg.log_path, cfg.name, cfg.version, args.config.split('/')[-1]))
        if cfg.load_from_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.load_from_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.fit(model)

    if args.test:
        if cfg.test_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.test_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.test(model)

def setup_callbacks_loggers(args):
    log_path = Path('/home/yyousfi1/LogFiles/comma/')
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir) / 'checkpoints/{epoch:02d}_{val_loss:.4f}',
                                    save_top_k=10,
                                    save_last=True)
    return ckpt_callback, tb_logger, lr_logger

def main(args):
    logger = pl_loggers.WandbLogger(experiment="example", save_dir=None)
    early_stop = EarlyStopping(monitor="val_loss")
    checkpoint_callback = ModelCheckpoint(dirpath="ckpts/", monitor="val_loss")
    model = ExampleModel(args)
    lr_logger = LearningRateLogger()
    trainer = Trainer.from_argparse_args(args,
                                         logger=logger,
                                         callbacks=[early_stop, lr_logger],
                                         checkpoint_callback=checkpoint_callback)
    trainer.fit(model)

def setup_callbacks_loggers(args):
    log_path = Path('/home/yyousfi1/LogFiles/OneHotConv/')
    log_path = log_path / args.qf / args.stego_scheme / args.payload
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir) / 'checkpoints/{epoch:02d}_{val_FC_acc:.3f}',
                                    save_top_k=5,
                                    save_last=True)
    return ckpt_callback, tb_logger, lr_logger

def main(cfg: DictConfig) -> None:
    print(cfg.pretty())
    neptune_logger = CustomNeptuneLogger(
        params=flatten_dict(OmegaConf.to_container(cfg, resolve=True)),
        **cfg.logging.neptune_logger)
    tb_logger = loggers.TensorBoardLogger(**cfg.logging.tb_logger)
    lr_logger = LearningRateLogger()  # TODO change to cyclicLR per epochs
    my_callback = MyCallback(cfg)

    model = get_model(cfg)
    if cfg.model.ckpt_path is not None:
        ckpt_pth = glob.glob(utils.to_absolute_path(cfg.model.ckpt_path))
        model = load_pytorch_model(ckpt_pth[0], model)

    seed_everything(2020)

    # TODO change to enable logging losses
    lit_model = O2UNetSystem(hparams=cfg, model=model)

    checkpoint_callback_conf = OmegaConf.to_container(cfg.callbacks.model_checkpoint, resolve=True)
    checkpoint_callback = ModelCheckpoint(**checkpoint_callback_conf)

    early_stop_callback_conf = OmegaConf.to_container(cfg.callbacks.early_stop, resolve=True)
    early_stop_callback = EarlyStopping(**early_stop_callback_conf)

    trainer = Trainer(
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        logger=[tb_logger, neptune_logger],
        # logger=[tb_logger],
        callbacks=[lr_logger, my_callback],
        **cfg.trainer)

    # TODO change to train with all data
    datasets = get_datasets(OmegaConf.to_container(cfg, resolve=True))
    train_dataset = datasets["train"]
    valid_dataset = datasets["valid"]

    trainer.fit(
        lit_model,
        train_dataloader=DataLoader(train_dataset, **cfg["training"]["dataloader"]["train"]),
        val_dataloaders=DataLoader(valid_dataset, **cfg["training"]["dataloader"]["valid"]))

def test_lr_logger_no_lr(tmpdir):
    tutils.reset_seed()

    model = EvalModelTemplate()

    lr_logger = LearningRateLogger()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_val_batches=0.1,
                      train_percent_check=0.5,
                      callbacks=[lr_logger])

    with pytest.warns(RuntimeWarning):
        result = trainer.fit(model)
        assert result

def test_tbd_remove_in_v0_11_0_trainer():
    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        LearningRateLogger()

    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        trainer = Trainer(row_log_interval=8)
    assert trainer.log_every_n_steps == 8
    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        assert trainer.row_log_interval == 8

    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        trainer = Trainer(log_save_interval=9)
    assert trainer.flush_logs_every_n_steps == 9
    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        assert trainer.log_save_interval == 9

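# Note (assumption, not part of the snippets above): the deprecation warnings matched in
# test_tbd_remove_in_v0_11_0_trainer indicate that LearningRateLogger is being phased out.
# A minimal sketch of the equivalent setup, assuming a PyTorch Lightning release in which
# the callback has been renamed to LearningRateMonitor:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor

lr_monitor = LearningRateMonitor(logging_interval='step')  # 'step' or 'epoch'
trainer = Trainer(callbacks=[lr_monitor])
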
def init_trainer(): """ Init a Lightning Trainer using from_argparse_args Thus every CLI command (--gpus, distributed_backend, ...) become available. """ parser = ArgumentParser() parser = Trainer.add_argparse_args(parser) args = parser.parse_args() lr_logger = LearningRateLogger() early_stopping = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.001, patience=10, verbose=True) return Trainer.from_argparse_args(args, callbacks=[lr_logger, early_stopping])
def train(file_path,
          train_ratio=0.8,
          optimizer="AdamW",
          intent_optimizer_lr=1e-4,
          entity_optimizer_lr=2e-4,
          epochs=20,
          batch_size=None,
          gpu_num=0,
          distributed_backend=None,
          checkpoint_prefix='morphine_model_'):
    early_stopping = EarlyStopping('val_loss')
    lr_logger = LearningRateLogger()
    checkpoint_callback = model_checkpoint.ModelCheckpoint(prefix=checkpoint_prefix)

    prepare_data_per_node = True
    if 0 <= gpu_num < 2:
        prepare_data_per_node = False

    if batch_size is None:
        trainer = Trainer(auto_scale_batch_size="power",
                          max_epochs=epochs,
                          gpus=gpu_num,
                          distributed_backend=distributed_backend,
                          early_stop_callback=early_stopping,
                          callbacks=[lr_logger],
                          checkpoint_callback=checkpoint_callback,
                          prepare_data_per_node=prepare_data_per_node)
    else:
        trainer = Trainer(max_epochs=epochs,
                          gpus=gpu_num,
                          distributed_backend=distributed_backend,
                          early_stop_callback=early_stopping,
                          callbacks=[lr_logger],
                          checkpoint_callback=checkpoint_callback,
                          prepare_data_per_node=prepare_data_per_node)

    model_args = {}
    model_args["epochs"] = epochs
    model_args["batch_size"] = batch_size
    model_args["nlu_data"] = open(file_path, encoding="utf-8").readlines()
    model_args["train_ratio"] = train_ratio
    model_args["optimizer"] = optimizer
    model_args["intent_optimizer_lr"] = intent_optimizer_lr
    model_args["entity_optimizer_lr"] = entity_optimizer_lr

    hparams = Namespace(**model_args)
    model = MorphineClassifier(hparams)
    trainer.fit(model)

def train(omegaConf: DictConfig) -> LightningModule:
    # Misc part
    if omegaConf['runner']['verbose'] is True:
        print(OmegaConf.to_yaml(omegaConf))
    pl.seed_everything(omegaConf['runner']['seed'])

    # Runner part
    runner = make_runner(omegaConf['runner'])

    if "auto_lr_find" in omegaConf['trainer'] and omegaConf['trainer']['auto_lr_find'] is True:
        runner = custom_lr_finder(runner, omegaConf)

    # When we are here, the omegaConf has already been checked by OmegaConf,
    # so we can extract primitives to use with other libs
    config = OmegaConf.to_container(omegaConf)
    assert isinstance(config, dict)

    config['trainer']['default_root_dir'] = check_default_root_dir(config)
    config['trainer']['checkpoint_callback'] = build_checkpoint_callback(config)
    if 'logger' in config['trainer']:
        config['trainer']['logger'] = build_logger(config)
    if 'deterministic' in config['trainer']:
        config['trainer']['deterministic'] = True
    if 'profiler' in config['trainer'] and config['trainer']['profiler'] is True:
        config['trainer']['profiler'] = AdvancedProfiler()
    if 'scheduler' in config['runner'] and config['runner']['scheduler'] is not None:
        lr_monitor = LearningRateLogger(logging_interval='step')
        config['trainer']['callbacks'] = [lr_monitor]

    # ###
    # # Early stopping
    # # It is breaking neptune logging somehow; it seems to override the current timestep by 1
    # ###
    # early_stop_callback = EarlyStopping(
    #     monitor='val_accuracy', min_delta=0.00, patience=10, verbose=False, mode='max'
    # )
    # config['trainer']['early_stop_callback'] = early_stop_callback

    trainer = pl.Trainer(**config['trainer'])
    trainer.fit(runner)

    return runner

def train_regression(hparams):
    if hparams.model == "UNetDS_Attention":
        net = unet_regr.UNetDS_Attention(hparams=hparams)
    elif hparams.model == "UNet_Attention":
        net = unet_regr.UNet_Attention(hparams=hparams)
    elif hparams.model == "UNet":
        net = unet_regr.UNet(hparams=hparams)
    elif hparams.model == "UNetDS":
        net = unet_regr.UNetDS(hparams=hparams)
    else:
        raise NotImplementedError(f"Model '{hparams.model}' not implemented")

    torchsummary.summary(net, (12, 288, 288), device="cpu")
    # return

    default_save_path = "lightning/precip_regression"

    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd() + "/" + default_save_path + "/" + net.__class__.__name__ + "/{epoch}-{val_loss:.6f}",
        save_top_k=-1,
        verbose=False,
        monitor='val_loss',
        mode='min',
        prefix=net.__class__.__name__ + "_rain_threshhold_50_")
    lr_logger = LearningRateLogger()
    tb_logger = loggers.TensorBoardLogger(save_dir=default_save_path, name=net.__class__.__name__)

    earlystopping_callback = EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=hparams.es_patience,  # is effectively halved (due to a bug in pytorch-lightning)
    )
    trainer = pl.Trainer(fast_dev_run=hparams.fast_dev_run,
                         gpus=hparams.gpus,
                         weights_summary=None,
                         max_epochs=hparams.epochs,
                         default_save_path=default_save_path,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=earlystopping_callback,
                         logger=tb_logger,
                         callbacks=[lr_logger],
                         resume_from_checkpoint=hparams.resume_from_checkpoint,
                         val_check_interval=hparams.val_check_interval,
                         overfit_pct=hparams.overfit_pct)
    trainer.fit(net)

def train(model_name: str, n_cr: int, num_workers: int = 0, is_test: bool = False,
          resume_from_checkpoint: str = None):
    seed_everything(SEED)
    if model_name == "improved_gan":
        config_function = get_gan_test_config if is_test else get_gan_default_config
        config = config_function(n_cr)
        model = GAN(config, num_workers, improved=True)
    elif model_name == "default_gan":
        config_function = get_gan_test_config if is_test else get_gan_default_config
        config = config_function(n_cr)
        model = GAN(config, num_workers, improved=False)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project="GAN", log_model=True, offline=is_test)
    wandb_logger.watch(model, log="all")

    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(wandb.run.dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.save_every_epoch,
        save_top_k=3,
    )

    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None

    # define learning rate logger
    lr_logger = LearningRateLogger()
    trainer = Trainer(
        max_epochs=config.n_epochs,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        row_log_interval=config.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )
    trainer.fit(model)
    trainer.test()

def main(cfg: DictConfig):
    LOG.info("Config:\n" + OmegaConf.to_yaml(cfg))
    seed_everything(cfg.seed)

    system = EpicActionRecognitionSystem(cfg)
    if not cfg.get("log_graph", True):
        # MTRN can't be traced due to the model stochasticity, which causes a JIT tracer
        # error, so we allow the user to prevent the tracer from running to log the graph
        # when the summary writer is created
        system.example_input_array = None  # type: ignore
    data_module = EpicActionRecogintionDataModule(cfg)

    lr_logger = LearningRateLogger(logging_interval="step")
    checkpoint_callback = ModelCheckpoint(save_last=True)
    # with ipdb.launch_ipdb_on_exception():
    trainer = Trainer(callbacks=[lr_logger],
                      checkpoint_callback=checkpoint_callback,
                      **cfg.trainer)
    trainer.fit(system, datamodule=data_module)

def main(hparams):
    pl.seed_everything(42)
    checkpoint_callback = ModelCheckpoint(save_top_k=-1)
    wandb_logger = WandbLogger(project='video-colorization',
                               tags=["colornet"],
                               name='SLURM',
                               log_model=True)
    hparams.logger = wandb_logger
    lr_logger = LearningRateLogger()
    colornet = model.ColorNet(hparams)
    trainer = pl.Trainer.from_argparse_args(hparams,
                                            checkpoint_callback=checkpoint_callback,
                                            callbacks=[lr_logger])
    trainer.fit(colornet)

def main(hparams):
    hparams.data_path += "x"
    experiments = {"0": [1], "1": [1], "2": [1], "3": [1], "4": [1]}
    loss_weights = {"1": 1.0, "5": 3.25, "21": 5.5}
    for seed, labels in experiments.items():
        for amount_labels in labels:
            splitted_path = hparams.data_path.split("/")
            splitted_path[-1] = str(seed)
            hparams.data_path = "/".join(splitted_path)
            hparams.amount_labels = amount_labels
            hparams.loss_weight = loss_weights[str(amount_labels)]
            model = RecRob(hparams)
            name = "recrob"
            logger = loggers.TensorBoardLogger(save_dir=hparams.output_dir,
                                               name=name,
                                               version="{}-{}".format(str(amount_labels), str(seed)),
                                               log_graph=True)
            lr_logger = LearningRateLogger(logging_interval="step")
            trainer = Trainer(
                default_root_dir=logger.log_dir + "/checkpoints/",
                logger=logger,
                log_save_interval=10,
                callbacks=[lr_logger],
                gpus=hparams.gpus,
                tpu_cores=hparams.tpu_cores,
                fast_dev_run=hparams.fast_dev_run,
                max_epochs=hparams.max_epochs,
                auto_lr_find=hparams.auto_lr_find,
                gradient_clip_val=hparams.gradient_clip_val,
                check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                amp_level=hparams.amp_level,
                accumulate_grad_batches=hparams.accumulate_grad_batches)
            print("Hyperparameter:")
            print("_______________")
            print(json.dumps(vars(hparams), indent=4))
            trainer.fit(model)
            test_result = trainer.test(model)
            trainer.logger.save()

def run(self, args=None):
    args = self.parse_args(args)
    seed_everything(args.seed)
    pprint.pprint('args')
    pprint.pprint(args.__dict__)
    pprint.pprint('*********************')
    checkpoint_callback = ModelCheckpoint(monitor='valid_loss', verbose=True, save_last=True)
    logger.info(args)
    lr_logger = LearningRateLogger()
    trainer = Trainer(
        default_root_dir=args.default_root_dir,
        progress_bar_refresh_rate=args.progress_bar_refresh_rate,
        min_epochs=args.min_epochs,
        max_epochs=args.max_epochs,
        val_check_interval=args.val_check_interval,
        limit_val_batches=args.limit_val_batches,
        gpus=args.gpus,
        distributed_backend=args.distributed_backend,
        row_log_interval=1,
        amp_level=args.amp_level,
        precision=args.precision,
        num_nodes=args.num_nodes,
        tpu_cores=args.tpu_cores,
        accumulate_grad_batches=args.accumulate_grad_batches,
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=args.resume_from_checkpoint,
        fast_dev_run=args.fast_dev_run,
        callbacks=[lr_logger],
    )
    model = self.get_model(args)
    logger.info(f'Start Training model {model}')
    logger.info('')
    trainer.fit(model)
    logger.info('Training loop finished.')
    return trainer

def main(args: Namespace, model_cls) -> None:
    if args.seed is not None:
        pl.seed_everything(args.seed)

    if args.distributed_backend == 'ddp':
        # When using a single GPU per process and per
        # DistributedDataParallel, we need to divide the batch size
        # ourselves based on the total number of GPUs we have
        args.batch_size = int(args.batch_size / max(1, args.gpus))
        args.workers = int(args.workers / max(1, args.gpus))

    model = model_cls(**vars(args))
    lr_logger = LearningRateLogger(logging_interval='step')
    trainer = pl.Trainer.from_argparse_args(args, callbacks=[lr_logger])

    if args.evaluate:
        trainer.test(model)
    else:
        trainer.fit(model)

def test_lr_logger_param_groups(tmpdir):
    """Test that learning rates are extracted and logged for a single lr scheduler with multiple param groups."""
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__param_groups

    lr_logger = LearningRateLogger()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=5,
                      val_percent_check=0.1,
                      train_percent_check=0.5,
                      callbacks=[lr_logger])
    results = trainer.fit(model)

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == 2 * len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of param groups'
    assert all(k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_logger.lrs.keys()), \
        'Names of learning rates not set correctly'