def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
    whilst using configure_optimizers for optimizers and schedulers."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer

            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_schedulers[0]["scheduler"], torch.optim.lr_scheduler.StepLR)
            # check that the lr_scheduler config was preserved
            assert trainer.lr_schedulers[0]["name"] == "Sean"
            # Ensure DeepSpeed engine has initialized with our lr_scheduler
            assert isinstance(trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)

    class TestModel(BoringModel):
        def configure_optimizers(self):
            [optimizer], [scheduler] = super().configure_optimizers()
            return {
                "optimizer": optimizer,
                "lr_scheduler": {"scheduler": scheduler, "name": "Sean"},
            }

    model = TestModel()
    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        plugins=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        gpus=1,
        fast_dev_run=True,
        precision=16,
        callbacks=[TestCB(), lr_monitor],
    )
    trainer.fit(model)

    assert lr_monitor.lrs == {"Sean": [0.1]}
    _assert_save_model_is_equal(model, tmpdir, trainer)
def training_loop(train, valid, save_path, pl_module, callbacks, n_epochs, checkpoint_callback,
                  use_neptune=False, resume=True, limit_train_batches=None, neptune_tags="", neptune_name=""):
    """Largely model/application agnostic training code."""
    # Train with proper resuming
    # Copy gin configs used, for reference, to the save folder
    if not limit_train_batches:
        limit_train_batches = len(train)
    os.system("rm " + os.path.join(save_path, "*gin"))
    for gin_config in sys.argv[2].split(";"):
        os.system("cp {} {}/base_config.gin".format(gin_config, save_path))
    with open(os.path.join(save_path, "config.gin"), "w") as f:
        f.write(gin.operative_config_str())

    hparams = parse_gin_config(os.path.join(save_path, 'config.gin'))
    if 'train.callbacks' in hparams:
        del hparams['train.callbacks']
    # TODO: What is a less messy way to pass hparams? This is only so that logging is aware of the hyperparameters.
    pl_module._set_hparams(hparams)
    pl_module._hparams_initial = copy.deepcopy(hparams)

    loggers = []
    loggers.append(pl_loggers.CSVLogger(save_path))
    if use_neptune:
        from pytorch_lightning.loggers import NeptuneLogger
        loggers.append(NeptuneLogger(
            api_key=NEPTUNE_TOKEN,
            project_name=NEPTUNE_USER + "/" + NEPTUNE_PROJECT,
            experiment_name=neptune_name if len(neptune_name) else os.path.basename(save_path),
            tags=neptune_tags.split(',') if len(neptune_tags) else None,
        ))

    callbacks += [MetaSaver(), Heartbeat(), LearningRateMonitor()]
    trainer = pl.Trainer(
        default_root_dir=save_path,
        limit_train_batches=limit_train_batches,
        max_epochs=n_epochs,
        logger=loggers,
        callbacks=callbacks,
        log_every_n_steps=1,
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=os.path.join(save_path, 'last.ckpt')
        if resume and os.path.exists(os.path.join(save_path, 'last.ckpt')) else None,
    )
    trainer.fit(pl_module, train, valid)
    return trainer
def callback_objects(model_config, lr_logger=False):
    callback_list = model_config["callbacks"]
    callback_list = handle_config_cases(callback_list)

    model_set = model_config["set"]
    model_library = model_config["model_library"]
    callback_object_list = [
        find_model(model_set, callback, model_library)() for callback in callback_list
    ]

    if lr_logger:
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        callback_object_list = callback_object_list + [lr_monitor]

    logging.info("Callbacks found")
    return callback_object_list
def generateModel():
    CHECKPOINT_ROOT = BASE / "checkpoints"
    model = AlexnetFineTuning((3, 224, 224))
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    CHECKPOINT_PATH = CHECKPOINT_ROOT
    checkpoint_callback = ModelCheckpoint(
        dirpath=CHECKPOINT_PATH,
        filename='{epoch}-{val_acc:.2f}',
        monitor="val_acc",
        save_last=True,
        mode="max",
        save_top_k=-1,
        period=300,
    )
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[checkpoint_callback, lr_monitor],
        max_epochs=EPOCHS,
        progress_bar_refresh_rate=0,
        weights_summary=None,
    )
    trainer.fit(model, train_loader, val_loader)
    trainer.test(test_dataloaders=test_loader)
def test_lr_monitor_no_lr_scheduler(tmpdir):
    tutils.reset_seed()

    class CustomBoringModel(BoringModel):
        def configure_optimizers(self):
            optimizer = optim.SGD(self.parameters(), lr=0.1)
            return optimizer

    model = CustomBoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )

    with pytest.warns(RuntimeWarning, match="have no learning rate schedulers"):
        trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
def test_lr_monitor_no_lr_scheduler(tmpdir):
    tutils.reset_seed()

    model = EvalModelTemplate()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )

    with pytest.warns(RuntimeWarning, match='have no learning rate schedulers'):
        result = trainer.fit(model)
        assert result
def configure_callbacks(self):
    cwd = hydra.utils.get_original_cwd()
    filename = "{}_{}_{}_H{}_W{}".format(
        self.args.exp_name,
        self.args.arch.decoder,
        self.args.arch.encoder,
        self.args.arch.image_height,
        self.args.arch.image_width,
    )
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        mode="min",
        dirpath=os.path.join(cwd, self.args.path2weight),
        filename=filename,
        save_top_k=1,
    )
    lr_monitor = LearningRateMonitor(logging_interval="step")
    return [checkpoint_callback, lr_monitor]
def main(args):
    pl.seed_everything(42)

    # load configs
    with open(args.cfg_path, 'r') as fp:
        cfg = json.load(fp)
    cfg_train = cfg['train']
    print('### TRAIN CONFIGS:')
    pprint(cfg_train)
    print('### MODEL CONFIGS:')
    pprint(cfg['model'])
    # os.environ['CUDA_VISIBLE_DEVICES'] = cfg_train['trainer_kwargs']['gpus']

    # init model
    ModelClass = {
        'cvae': CvaeInception,
        'vae': VaeInception,
    }[cfg_train['type']]
    model = ModelClass(cfg['model'])

    # init data loader
    dm = UrbanSoundsModule(
        cfg['dataset'],
        num_workers=cfg_train['num_workers'],
        batch_size=cfg_train['batch_size'])
    dm.setup()

    # logger
    log_name = 'usnds_{}_{}'.format(ModelClass.model_name, cfg_train['descr'])
    logger = TensorBoardLogger(save_dir='logs', name=log_name)

    # callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=cfg_train['patience'])
    lr_monitor = LearningRateMonitor(logging_interval='epoch')

    # trainer
    trainer = pl.Trainer(
        max_epochs=cfg_train['max_epochs'],
        logger=logger,
        callbacks=[early_stop, lr_monitor],
        **cfg_train['trainer_kwargs'])

    # train
    trainer.fit(model=model, datamodule=dm)
def main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = get_parser()
    parser = MrcSpanProposal.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # model
    # ------------
    model = MrcSpanProposal(args)

    # load pretrained_model
    if args.pretrained:
        model.load_state_dict(
            torch.load(args.pretrained, map_location=torch.device('cpu'))["state_dict"]
        )

    # callbacks
    checkpoint_callback = ModelCheckpoint(
        monitor=f'val_top{MrcSpanProposal.acc_topk}_acc',
        dirpath=args.default_root_dir,
        save_top_k=10,
        save_last=True,
        mode='max',
        verbose=True
    )
    lr_monitor = LearningRateMonitor(logging_interval='step')
    print_model = ModelPrintCallback(print_modules=["model"])
    callbacks = [checkpoint_callback, lr_monitor, print_model]
    if args.freeze_bert:
        callbacks.append(EvalCallback(["model.bert"]))

    trainer = pl.Trainer.from_argparse_args(
        args,
        callbacks=callbacks,
        replace_sampler_ddp=False
    )
    trainer.fit(model)
def test_lr_monitor_no_lr_scheduler_multi_lrs(tmpdir, logging_interval: str):
    """Test that learning rates are extracted and logged for multi optimizers but no lr scheduler."""
    tutils.reset_seed()

    class CustomBoringModel(BoringModel):
        def training_step(self, batch, batch_idx, optimizer_idx):
            return super().training_step(batch, batch_idx)

        def configure_optimizers(self):
            optimizer1 = optim.Adam(self.parameters(), lr=1e-2)
            optimizer2 = optim.Adam(self.parameters(), lr=1e-2)
            return [optimizer1, optimizer2]

    model = CustomBoringModel()
    model.training_epoch_end = None

    lr_monitor = LearningRateMonitor(logging_interval=logging_interval)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        log_every_n_steps=log_every_n_steps,
        limit_train_batches=7,
        limit_val_batches=0.1,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == len(trainer.optimizers)
    assert list(lr_monitor.lrs) == ["lr-Adam", "lr-Adam-1"], "Names of learning rates not set correctly"

    if logging_interval == "step":
        # divide by 2 because we have 2 optimizers
        expected_number_logged = trainer.global_step // 2 // log_every_n_steps
    if logging_interval == "epoch":
        expected_number_logged = trainer.max_epochs

    assert all(len(lr) == expected_number_logged for lr in lr_monitor.lrs.values())
def train(cfg):
    datamodule_args = {}

    if cfg.transforms.train:
        train_transforms = [hydra.utils.instantiate(t) for t in cfg.transforms.train]
        datamodule_args["train_transforms"] = train_transforms

    if cfg.transforms.val:
        val_transforms = [hydra.utils.instantiate(t) for t in cfg.transforms.val]
        datamodule_args["val_transforms"] = val_transforms

    data_module = hydra.utils.instantiate(cfg.dataset, **datamodule_args)
    data_module.prepare_data()

    lr_scheduler = resolve_steps_per_epoch(cfg, len_train=data_module.len_train)
    model = FlowerClassifier(
        **cfg.model,
        optimizer_config=cfg.optimizer,
        lr_scheduler_config=lr_scheduler,
        batch_size=cfg.dataset.batch_size,
    )

    logger = hydra.utils.instantiate(cfg.trainer.logger) or False
    lr_logger = LearningRateMonitor(logging_interval="step")
    callbacks = [lr_logger]

    # checkpoint callback requires dynamic configuration
    experiment = getattr(logger, "experiment", None)
    logger_dir = getattr(experiment, "dir", "logger")
    checkpoints_dir = os.path.join(logger_dir, "{epoch}")
    checkpoint_callback = hydra.utils.instantiate(
        cfg.trainer.checkpoint_callback, filepath=checkpoints_dir) or False
    if checkpoint_callback:
        callbacks.append(checkpoint_callback)

    trainer_args = {
        **cfg.trainer,
        "logger": logger,
        "callbacks": callbacks,
    }
    trainer = Trainer(**trainer_args)
    trainer.fit(model, datamodule=data_module)
def test_lr_monitor_custom_name(tmpdir):
    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer, [scheduler] = super().configure_optimizers()
            lr_scheduler = {'scheduler': scheduler, 'name': 'my_logging_name'}
            return optimizer, [lr_scheduler]

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
        progress_bar_refresh_rate=0,
        weights_summary=None,
    )
    trainer.fit(TestModel())
    assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name']
def train(args, trainer_args, model_args):
    # df = pd.read_csv(os.path.join(args['data_directory'], 'train.csv'))
    # train_df, val_df = model_selection.train_test_split(df, test_size=0.1, random_state=42, stratify=df.label.values)
    # train_df.reset_index(inplace=True, drop=True)
    # val_df.reset_index(inplace=True, drop=True)
    datamodule = CassavaDataModule(batch_size=args['batch_size'],
                                   data_dir=args['data_directory'],
                                   num_workers=4,
                                   sample_size=args['sample_size'])

    classifier_list = [Resnet18, Resnet50, EfficientNetB1, VisTransformer]
    classifier_names = [elem.__name__.lower() for elem in classifier_list]
    classifier_model_name = args['model_type']
    classifier = classifier_list[classifier_names.index(classifier_model_name)]
    classifier_model_dir = os.path.join('logs', classifier_model_name)

    # trainer_args = {'max_epochs': 8, 'profiler': 'simple', 'precision': 16, 'gradient_clip_val': 100, 'gpus': 1}
    # model_args = {'lr': 5e-5}
    load_pretrained = False
    load_pretrained = os.path.exists(classifier_model_dir) and load_pretrained
    checkpoints = list(
        filter(lambda x: '.ckpt' in x, os.listdir(classifier_model_dir))) if load_pretrained else []
    load_pretrained = load_pretrained and len(checkpoints) > 0

    if load_pretrained:
        checkpoint_path = os.path.join(classifier_model_dir, checkpoints[-1])
        model = classifier.load_from_checkpoint(checkpoint_path)
    else:
        model = classifier(**model_args)
    print(model)

    logger = TensorBoardLogger("logs", name=classifier_model_name, log_graph=True)
    lr_monitor = LearningRateMonitor(logging_interval='step')
    model_chkpt = ModelCheckpoint(dirpath=classifier_model_dir,
                                  monitor='val_acc_epoch',
                                  filename='{epoch}-{val_acc_epoch:.2f}',
                                  verbose=True)
    early_stopper = EarlyStopping(monitor='val_acc_epoch', patience=6, verbose=True)

    trainer = pl.Trainer(logger=logger,
                         callbacks=[lr_monitor, model_chkpt, early_stopper],
                         **trainer_args)
    trainer.fit(model, datamodule)
def test_lightning_cli_config_and_subclass_mode(tmpdir):
    input_config = {
        "fit": {
            "model": {"class_path": "tests.helpers.BoringModel"},
            "data": {
                "class_path": "tests.helpers.BoringDataModule",
                "init_args": {"data_dir": str(tmpdir)},
            },
            "trainer": {"default_root_dir": str(tmpdir), "max_epochs": 1, "weights_summary": None},
        }
    }
    config_path = tmpdir / "config.yaml"
    with open(config_path, "w") as f:
        f.write(yaml.dump(input_config))

    with mock.patch("sys.argv", ["any.py", "--config", str(config_path)]):
        cli = LightningCLI(
            BoringModel,
            BoringDataModule,
            subclass_mode_model=True,
            subclass_mode_data=True,
            trainer_defaults={"callbacks": LearningRateMonitor()},
        )

    config_path = tmpdir / "lightning_logs" / "version_0" / "config.yaml"
    assert os.path.isfile(config_path)
    with open(config_path) as f:
        loaded_config = yaml.safe_load(f.read())

    loaded_config = loaded_config["fit"]
    cli_config = cli.config["fit"]

    assert loaded_config["model"] == cli_config["model"]
    assert loaded_config["data"] == cli_config["data"]
    assert loaded_config["trainer"] == cli_config["trainer"]
def get_callbacks(
    logging_interval: str,
    experiment_type: str,
    save_top_k: int = 1,
    period: int = 1,
    monitor: str = "checkpoint_saving_loss",
):
    upload_comet_logs = UploadCometLogs(
        logging_interval, get_console_logger("callback"), experiment_type
    )
    lr_monitor = LearningRateMonitor(logging_interval=logging_interval)
    # saving the best model as per the validation loss.
    checkpoint_callback = UpdatedModelCheckpoint(
        save_top_k=save_top_k, period=period, monitor=monitor
    )
    return {
        "callbacks": [lr_monitor, upload_comet_logs],
        "checkpoint_callback": checkpoint_callback,
    }
def test_lr_monitor_single_lr(tmpdir):
    """Test that learning rates are extracted and logged for single lr scheduler."""
    tutils.reset_seed()

    model = BoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)

    assert lr_monitor.lrs, "No learning rates logged"
    assert all(v is None for v in lr_monitor.last_momentum_values.values()), "Momentum should not be logged by default"
    assert len(lr_monitor.lrs) == len(trainer.lr_scheduler_configs)
    assert list(lr_monitor.lrs) == ["lr-SGD"]
def test_lr_monitor_custom_name(tmpdir):
    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer, [scheduler] = super().configure_optimizers()
            lr_scheduler = {"scheduler": scheduler, "name": "my_logging_name"}
            return optimizer, [lr_scheduler]

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(TestModel())
    assert list(lr_monitor.lrs) == ["my_logging_name"]
def _get_trainer_callbacks(cfg: CfgNode) -> List[Callback]:
    """Gets the trainer callbacks based on the given D2Go Config.

    Args:
        cfg: The normalized ConfigNode for this D2Go Task.

    Returns:
        A list of configured Callbacks to be used by the Lightning Trainer.
    """
    callbacks: List[Callback] = [
        LearningRateMonitor(logging_interval="step"),
        ModelCheckpoint(
            dirpath=cfg.OUTPUT_DIR,
            save_last=True,
        ),
    ]
    if cfg.QUANTIZATION.QAT.ENABLED:
        callbacks.append(QuantizationAwareTraining.from_config(cfg))
    return callbacks
def train(params: Params):
    seed_everything(params.d.seed)

    tb_logger = TensorBoardLogger(
        params.t.save_dir,
        name=f'011_popularity',
        version=str(int(time())),
    )
    log_dir = Path(tb_logger.log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = getLogger('lightning')
    logger.addHandler(FileHandler(log_dir / 'train.log'))
    logger.info(params.pretty())

    callbacks = [
        LearningRateMonitor(),
    ]
    if params.t.checkpoint_callback:
        callbacks.append(
            ModelCheckpoint(
                monitor=None,
                verbose=True,
            ),
        )

    trainer = pl.Trainer(
        max_epochs=params.t.epochs,
        gpus=params.t.gpus,
        tpu_cores=params.t.num_tpu_cores,
        logger=tb_logger,
        precision=params.t.precision,
        resume_from_checkpoint=params.t.resume_from_checkpoint,
        weights_save_path=params.t.weights_save_path,
        checkpoint_callback=params.t.weights_save_path is not None,
        callbacks=callbacks,
        deterministic=True,
        benchmark=True,
        accumulate_grad_batches=params.t.accumulate_grad_batches,
        val_check_interval=params.t.val_check_interval,
    )
    net = PLModule(params.m.to_dict())
    dm = PopularityDataModule(params.d)
    trainer.fit(net, datamodule=dm)
def test_lr_monitor_single_lr_with_momentum(tmpdir, opt):
    """Test that learning rates and momentum are extracted and logged for single lr scheduler."""

    class LogMomentumModel(BoringModel):
        def __init__(self, opt):
            super().__init__()
            self.opt = opt

        def configure_optimizers(self):
            if self.opt == 'SGD':
                opt_kwargs = {'momentum': 0.9}
            elif self.opt == 'Adam':
                opt_kwargs = {'betas': (0.9, 0.999)}

            optimizer = getattr(optim, self.opt)(self.parameters(), lr=1e-2, **opt_kwargs)
            lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-2, total_steps=10_000)
            return [optimizer], [lr_scheduler]

    model = LogMomentumModel(opt=opt)
    lr_monitor = LearningRateMonitor(log_momentum=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=2,
        limit_train_batches=5,
        log_every_n_steps=1,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    assert all(v is not None for v in lr_monitor.last_momentum_values.values()), \
        'Expected momentum to be logged'
    assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \
        'Number of momentum values logged does not match number of lr schedulers'
    assert all(k == f'lr-{opt}-momentum' for k in lr_monitor.last_momentum_values.keys()), \
        'Names of momentum values not set correctly'
def train_and_test(model, dm, logger):
    ES = EarlyStopping(
        monitor="train_loss",
        min_delta=0.001,
        patience=10,
        verbose=True,
        mode="min",
        strict=True,
    )
    LRM = LearningRateMonitor("epoch")
    tag = "unsupervised_v1"
    CKPT = ModelCheckpoint(
        dirpath="/gaia/models",
        filename=tag,
        monitor="train_loss",
        mode="min",
        verbose=True,
    )
    LSS = LatentSpaceSaver()
    trainer = pl.Trainer(
        auto_lr_find=False,
        gpus=1,
        auto_select_gpus=False,
        gradient_clip_val=1.0,
        log_gpu_memory="min_max",
        reload_dataloaders_every_epoch=True,
        callbacks=[ES, LRM, CKPT, LSS, Plotter()],
        accelerator="ddp",
        log_every_n_steps=150,
        flush_logs_every_n_steps=300,
        terminate_on_nan=True,
        track_grad_norm=2,
        weights_summary="full",
        profiler="simple",
        replace_sampler_ddp=True,
        logger=logger,
    )
    # trainer.tune(model=model, datamodule=dm)
    trainer.fit(model=model, datamodule=dm)
    trainer.test(model=model, datamodule=dm, verbose=True)
def train(args):
    print('Training arguments: ', args)
    seed_everything(args.seed)
    os.makedirs(args.log_dir, exist_ok=True)

    data = SNLIData(batch_size=args.batch_size)
    train_loader, val_loader, test_loader = data.get_iters()

    checkpoint_callback = ModelCheckpoint(monitor='val_loss')
    trainer = Trainer(
        default_root_dir=args.log_dir,
        limit_train_batches=args.limit_train_batches,  # for testing with less data
        fast_dev_run=False,  # for checking with 1 batch
        callbacks=[
            LearningRateMonitor(logging_interval='step'),
            checkpoint_callback
        ],
        logger=TensorBoardLogger(args.log_dir, name=args.encoder_type),
        gpus=1 if torch.cuda.is_available() else 0,
        max_epochs=args.epochs,
        progress_bar_refresh_rate=args.refresh_rate)

    model = NLINet(encoder_type=args.encoder_type,
                   enc_hidden_dim=args.enc_hidden_dim,
                   cls_hidden_dim=args.cls_hidden_dim,
                   lr=args.lr,
                   dataset_sizes=data.sizes)

    # Training
    trainer.fit(model, train_loader, val_loader)
    print('Best checkpoint:', checkpoint_callback.best_model_path)

    # Testing
    # model = NLINet.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    test_result = trainer.test(model, test_dataloaders=test_loader, verbose=True)
    return test_result
def test_lr_monitor_custom_pg_name(tmpdir):
    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer = torch.optim.SGD([{"params": list(self.layer.parameters()), "name": "linear"}], lr=0.1)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
            return [optimizer], [lr_scheduler]

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=2,
        limit_train_batches=2,
        callbacks=[lr_monitor],
        progress_bar_refresh_rate=0,
        weights_summary=None,
    )
    trainer.fit(TestModel())
    assert lr_monitor.lr_sch_names == ["lr-SGD"]
    assert list(lr_monitor.lrs) == ["lr-SGD/linear"]
def create_trainer(self, logger=None, callbacks=[], **kwargs):
    """Create a pytorch lightning trainer by reading config files.

    Args:
        callbacks (list): a list of pytorch-lightning callback classes
    """
    # If val data is passed, monitor learning rate and set up classification metrics
    if self.config["validation"]["csv_file"] is not None:
        if logger is not None:
            lr_monitor = LearningRateMonitor(logging_interval='epoch')
            callbacks.append(lr_monitor)

    self.trainer = pl.Trainer(logger=logger,
                              max_epochs=self.config["train"]["epochs"],
                              gpus=self.config["gpus"],
                              enable_checkpointing=False,
                              accelerator=self.config["distributed_backend"],
                              fast_dev_run=self.config["train"]["fast_dev_run"],
                              callbacks=callbacks,
                              **kwargs)
def train():
    model = SDDSegModel()
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath='checkpoints_sdd/deeplabv3_effnet-b2/',
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
        prefix='')
    lr_monitor = LearningRateMonitor(logging_interval='step')
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[lr_monitor],
        checkpoint_callback=checkpoint_callback,
        num_sanity_val_steps=-1,
        log_every_n_steps=4,
        max_epochs=22,
        resume_from_checkpoint='checkpoints_sdd/deeplabv3_effnet-b2/epoch=21-step=2639.ckpt')
    trainer.fit(model)
def _get_trainer(project_parameters):
    callbacks = [
        ModelCheckpoint(monitor='validation accuracy', mode='max'),
        LearningRateMonitor(logging_interval='epoch', log_momentum=True)
    ]
    if project_parameters.use_early_stopping:
        callbacks.append(
            EarlyStopping(monitor='validation loss',
                          patience=project_parameters.patience,
                          mode='min'))
    return Trainer(callbacks=callbacks,
                   gpus=project_parameters.gpus,
                   max_epochs=project_parameters.train_iter,
                   weights_summary=project_parameters.weights_summary,
                   profiler=project_parameters.profiler,
                   deterministic=True,
                   check_val_every_n_epoch=project_parameters.val_iter,
                   default_root_dir=project_parameters.save_path,
                   num_sanity_val_steps=0,
                   precision=project_parameters.precision)
def train_regression(args_lightning_model_parameters, epochs, gpus=1, es_patience=30):
    net = UNet(**args_lightning_model_parameters)
    torchsummary.summary(net, (12, 288, 288), device="cpu")
    # return

    default_save_path = "output/lightning/precip_regression"
    if not os.path.exists(default_save_path):
        os.makedirs(default_save_path)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd() + "/" + default_save_path + "/" + net.__class__.__name__ + "/{epoch}-{val_loss:.6f}",
        save_top_k=-1,
        verbose=False,
        monitor='val_loss',
        mode='min',
        prefix=net.__class__.__name__ + "_rain_threshhold_50_")
    lr_logger = LearningRateMonitor()
    tb_logger = loggers.TensorBoardLogger(save_dir=default_save_path, name=net.__class__.__name__)

    earlystopping_callback = EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=es_patience,  # is effectively half (due to a bug in pytorch-lightning)
    )
    trainer = pl.Trainer(gpus=gpus,
                         weights_summary=None,
                         max_epochs=epochs,
                         weights_save_path=default_save_path,
                         logger=tb_logger,
                         callbacks=[lr_logger, earlystopping_callback],
                         val_check_interval=0.25,
                         overfit_batches=0.1)
    # resume_from_checkpoint=resume_from_checkpoint,
    trainer.fit(net)
    return
def test_lr_monitor_param_groups(tmpdir):
    """Test that learning rates are extracted and logged for each parameter group of a single optimizer."""
    tutils.reset_seed()

    class CustomClassificationModel(ClassificationModel):
        def configure_optimizers(self):
            param_groups = [
                {"params": list(self.parameters())[:2], "lr": self.lr * 0.1},
                {"params": list(self.parameters())[2:], "lr": self.lr},
            ]
            optimizer = optim.Adam(param_groups)
            lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
            return [optimizer], [lr_scheduler]

    model = CustomClassificationModel()
    dm = ClassifDataModule()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )
    trainer.fit(model, datamodule=dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert len(lr_monitor.lrs) == 2 * len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of param groups"
    assert lr_monitor.lr_sch_names == ["lr-Adam"]
    assert list(lr_monitor.lrs.keys()) == ["lr-Adam/pg1", "lr-Adam/pg2"], "Names of learning rates not set correctly"
def test_lr_monitor_single_lr(tmpdir):
    """Test that learning rates are extracted and logged for single lr scheduler."""
    tutils.reset_seed()

    model = BoringModel()

    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_monitor],
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert lr_monitor.lrs, "No learning rates logged"
    assert all(v is None for v in lr_monitor.last_momentum_values.values()), "Momentum should not be logged by default"
    assert len(lr_monitor.lrs) == len(
        trainer.lr_schedulers
    ), "Number of learning rates logged does not match number of lr schedulers"
    assert (
        lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ["lr-SGD"]
    ), "Names of learning rates not set correctly"
def train(self,
          log_dir=os.path.abspath(os.path.join(os.path.dirname(__file__), '../data/logs/ocr')),
          seed: int = None
          ) -> NPOcrNet:
    """
    TODO: describe method
    """
    if seed is not None:
        aug_seed(seed)
        pl.seed_everything(seed)

    self.create_model()
    checkpoint_callback = ModelCheckpoint(dirpath=log_dir, monitor='val_loss')
    lr_monitor = LearningRateMonitor(logging_interval='step')
    self.trainer = pl.Trainer(max_epochs=self.epochs,
                              gpus=self.gpus,
                              callbacks=[checkpoint_callback, lr_monitor],
                              weights_summary=None)
    self.trainer.fit(self.model, self.dm)
    print("[INFO] best model path", checkpoint_callback.best_model_path)
    self.trainer.test()
    return self.model