def train_model(model, model_dir):
    # Setup trainer
    cb1 = callbacks.ModelCheckpoint(filename='best-{epoch}', monitor='val_loss_mean', save_top_k=1, mode='min')
    cb2 = callbacks.ModelCheckpoint(filename='last-{epoch}', save_last=True)
    tb_logger = pl_loggers.TensorBoardLogger('{}/logs/'.format(model_dir))
    if Constants.n_gpus != 0:
        # trainer = Trainer(gpus=Constants.n_gpus, distributed_backend='ddp', logger=tb_logger, precision=16,
        #                   default_root_dir=model_dir, max_epochs=n_epochs)
        trainer = Trainer(gpus=Constants.n_gpus, callbacks=[cb1, cb2],
                          plugins=DDPPlugin(find_unused_parameters=False), accelerator='ddp_spawn',
                          precision=16, logger=tb_logger, default_root_dir=model_dir, max_epochs=n_epochs)
    else:
        trainer = Trainer(gpus=0, default_root_dir=model_dir, callbacks=[cb1, cb2], logger=tb_logger,
                          distributed_backend='ddp_spawn', max_epochs=n_epochs)
    trainer.fit(model)
    trainer.test()
def build_callbacks(config):
    callback_list = []
    if config.TRAIN.CALLBACKS.LEARNING_RATE_MONITOR.ENABLE:
        callback_list.append(
            callbacks.LearningRateMonitor(
                logging_interval=config.TRAIN.CALLBACKS.LEARNING_RATE_MONITOR.LOGGING_INTERVAL
            )
        )
    if config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.ENABLE:
        callback_list.append(
            callbacks.ModelCheckpoint(
                dirpath=config.OUTPUT,
                filename=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.FILE_NAME,
                monitor=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.MONITOR,
                save_top_k=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.SAVE_TOP_K,
                mode=config.TRAIN.CALLBACKS.MODEL_CHECKPOINT.MODE
            )
        )
    # if config.TRAIN.CALLBACKS.INTERVAL_STEP_VALIDATE.ENABLE:
    #     callback_list.append(
    #         IntervalStepValidate(config)
    #     )
    return callback_list


# Run validation on specified steps
# class IntervalStepValidate(Callback):
#     def __init__(self, config):
#         self.config = config
#         self.total_steps = config.TRAIN.STEPS
#         self.validation_interval = config.TRAIN.CALLBACKS.INTERVAL_STEP_VALIDATE.INTERVAL
#
#     def on_batch_end(self, trainer, pl_module):
#         if self.total_steps % self.validation_interval == 0:
#             trainer.validate_step()
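# Usage sketch (assumption, not from the source): the list returned by build_callbacks is
# handed directly to the Trainer; `config` is the same yacs-style object read above and
# `model` is a hypothetical LightningModule.
# trainer = Trainer(default_root_dir=config.OUTPUT, callbacks=build_callbacks(config))
# trainer.fit(model)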
def test_top_k(save_mock, tmpdir, k: int, epochs: int, val_check_interval: float, expected: int):
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.last_coeff = 10.0

        def training_step(self, batch, batch_idx):
            loss = self.step(torch.ones(32))
            loss = loss / (loss + 0.0000001)
            loss += self.last_coeff
            self.log('my_loss', loss)
            self.last_coeff *= 0.999
            return loss

    model = TestModel()
    trainer = Trainer(
        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k)],
        default_root_dir=tmpdir,
        max_epochs=epochs,
        weights_summary=None,
        val_check_interval=val_check_interval,
    )
    trainer.fit(model)

    # make sure the checkpoint was saved the expected number of times
    assert save_mock.call_count == expected
def test_top_k(save_mock, tmpdir, k: int, epochs: int, val_check_interval: float, expected: int, save_last: bool):
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.last_coeff = 10.0

        def training_step(self, batch, batch_idx):
            loss = self.step(torch.ones(32))
            loss = loss / (loss + 0.0000001)
            loss += self.last_coeff
            self.log("my_loss", loss)
            self.last_coeff *= 0.999
            return loss

    model = TestModel()
    trainer = Trainer(
        callbacks=[
            callbacks.ModelCheckpoint(dirpath=tmpdir, monitor="my_loss", save_top_k=k, save_last=save_last)
        ],
        default_root_dir=tmpdir,
        max_epochs=epochs,
        weights_summary=None,
        val_check_interval=val_check_interval,
    )
    trainer.fit(model)

    if save_last:
        # last epochs are saved every step (so double the save calls) and once `on_train_end`
        expected = expected * 2 + 1
    assert save_mock.call_count == expected
def run(config="config/base.yml"):
    config = util.load_config(config)
    now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    run_dir = path.join("wandb", now)
    run_dir = path.abspath(run_dir)
    os.environ['WANDB_PROJECT'] = "linear_turing"
    os.environ['TOKENIZERS_PARALLELISM'] = 'true'

    checkpoint_callback = callbacks.ModelCheckpoint(monitor='val_loss', mode='min', save_weights_only=True,
                                                    save_last=True, filename='{epoch}_{val_loss:.2f}')
    other_callbacks = [
        pl.callbacks.LearningRateMonitor(),
        callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=10)
    ]

    experiment = Experiment(config)
    trainer = pl.Trainer(logger=pl.loggers.WandbLogger(log_model=True),
                         checkpoint_callback=checkpoint_callback,
                         callbacks=other_callbacks,
                         **config['trainer'])
    trainer.fit(experiment)
def train_center_net(train_df, oof_df):
    train_dataset = centernet.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size, shuffle=True,
                                  num_workers=4, drop_last=True, pin_memory=True)
    oof_dataset = centernet.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size, shuffle=False,
                                num_workers=4, pin_memory=True)

    model = Resnest50CenterNet(conf=Config)
    early_stop = callbacks.EarlyStopping(monitor='val_map', patience=10, mode='max', verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir), monitor='val_map',
                                           verbose=True, mode='max', save_top_k=1)
    cbs = [callbacks.LearningRateLogger()]
    trainer = Trainer(gpus=1, early_stop_callback=early_stop, checkpoint_callback=checkpoint, callbacks=cbs,
                      benchmark=True, deterministic=True, max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=oof_dataloader)

    valid_dataset = centernet.WheatDataset(get_data(mode='valid'), test=True, transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size, shuffle=False,
                                  num_workers=4, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
def train_step(args, timestr='', best_ckpt=None):
    data_cfg = {
        "VIDEO_FPS": args["VIDEO_FPS"],
        "DATA_DIRECTORY": args["DATA_DIRECTORY"],
        "PRETRAIN_NUM_WORDS": args["PRETRAIN_NUM_WORDS"],
        "CHAR_TO_INDEX": args["CHAR_TO_INDEX"],
        "STEP_SIZE": args["STEP_SIZE"],
        "NUM_WORKERS": args["NUM_WORKERS"],
        "BATCH_SIZE": args["BATCH_SIZE"],
        "PRETRAIN": args["PRETRAIN"],
    }
    train_cfg = {
        "INIT_LR": args["INIT_LR"],
        "MOMENTUM1": args["MOMENTUM1"],
        "MOMENTUM2": args["MOMENTUM2"],
        "LR_SCHEDULER_FACTOR": args["LR_SCHEDULER_FACTOR"],
        "LR_SCHEDULER_WAIT": args["LR_SCHEDULER_WAIT"],
        "LR_SCHEDULER_THRESH": args["LR_SCHEDULER_THRESH"],
        "FINAL_LR": args["FINAL_LR"],
    }
    net_cfg = {
        "dModel": args["TX_NUM_FEATURES"],
        "nHeads": args["TX_ATTENTION_HEADS"],
        "numLayers": args["TX_NUM_LAYERS"],
        "peMaxLen": args["PE_MAX_LENGTH"],
        "fcHiddenSize": args["TX_FEEDFORWARD_DIM"],
        "dropout": args["TX_DROPOUT"],
        "numClasses": args["NUM_CLASSES"],
    }

    logger = pl_loggers.NeptuneLogger(
        project_name='benso/deep-avsr',
        experiment_name='video_only_curriculum',
        params=args,
        tags={'start_date': timestr},
    )
    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filename=args["NUM_WORDS"] + '/{epoch:02d}-{val_wer:.2f}',
        save_weights_only=True,
        save_top_k=3,
        monitor='val_wer',
        period=1,
    )
    trainer = pl.Trainer(
        logger=logger,
        checkpoint_callback=model_checkpoint,
        gpus=2,
        auto_select_gpus=False,
        max_epochs=args["NUM_STEPS"],
        accelerator=args["ACCELERATOR"],
        resume_from_checkpoint=best_ckpt,
    )

    data = VideoNetDataModule(data_cfg=data_cfg)
    network = VideoNetPL(net_class=VideoNet, net_cfg=net_cfg, train_cfg=train_cfg)
    trainer.fit(model=network, datamodule=data)
    return model_checkpoint.best_model_path
def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            local_rank = int(os.getenv("LOCAL_RANK"))
            self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
            return super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs) -> None:
            data = str(self.global_rank)
            obj = [[data], (data, ), set(data)]
            out = self.trainer.training_type_plugin.broadcast(obj)
            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))]
            assert out == [['0'], ('0', ), set('0')]

    model = TestModel()
    trainer = Trainer(
        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
        default_root_dir=tmpdir,
        max_epochs=epochs,
        weights_summary=None,
        val_check_interval=val_check_interval,
        accelerator="ddp",
        gpus=2,
        limit_train_batches=64,
        limit_val_batches=32,
    )
    if os.getenv("LOCAL_RANK") == "0":
        with pytest.raises(UserWarning, match="The value associated to the key my_loss_epoch: [15.5, 31.0]"):
            trainer.fit(model)
        assert save_mock.call_count == expected
    else:
        trainer.fit(model)
def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            local_rank = int(os.getenv("LOCAL_RANK"))
            self.log("my_loss", batch_idx * (1 + local_rank), on_epoch=True)
            return super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs) -> None:
            local_rank = int(os.getenv("LOCAL_RANK"))
            if self.trainer.is_global_zero:
                self.log("my_loss_2", (1 + local_rank), on_epoch=True, rank_zero_only=True)
            data = str(self.global_rank)
            obj = [[data], (data,), set(data)]
            out = self.trainer.strategy.broadcast(obj)
            assert obj == [[str(self.global_rank)], (str(self.global_rank),), set(str(self.global_rank))]
            assert out == [["0"], ("0",), set("0")]

    model = TestModel()
    trainer = Trainer(
        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor="my_loss_step", save_top_k=k, mode="max")],
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=epochs,
        enable_model_summary=False,
        val_check_interval=val_check_interval,
        strategy="ddp",
        gpus=2,
        limit_train_batches=64,
        limit_val_batches=32,
    )
    trainer.fit(model)
    if os.getenv("LOCAL_RANK") == "0":
        assert save_mock.call_count == expected
def train(args, custom_callbacks=None):
    data_module = GwtDataModule(
        args.batch_size,
        args.num_dataset_workers,
        f'{args.dataset_base_path}/{args.split}/train.jsonl',
        f'{args.dataset_base_path}/{args.split}/validate.jsonl',
        f'{args.dataset_base_path}/{args.split}/test.jsonl',
        f'{args.dataset_base_path}/bpe_ast_vocab.txt',
    )
    if args.invalidate_line_caches:
        data_module.invalidate_caches()

    model = GwtSectionPredictionTransformer(
        data_module.vocab.get_size(),
        data_module.vocab.get_index(data_module.vocab.PAD_TOKEN),
        args.max_sequence_length,
        args.embedding_size,
        args.learning_rate,
        args.num_attention_heads,
        args.num_encoder_layers,
        args.num_decoder_layers,
        args.feedforward_dimensions,
        args.positional_encoding_dropout,
        args.transformer_dropout,
        args.lr_warmup_steps,
        args.optimize_on_smoothed_loss,
    )

    logger = loggers.TensorBoardLogger(
        args.tensorboard_dir,
        name=args.experiment_name,
        version=args.version,
    )
    logger.log_hyperparams(args)

    checkpoint_dir = os.path.join(logger.log_dir, 'checkpoints')
    loss_key = 'val_loss' if not args.optimize_on_smoothed_loss else 'label_smoothed_val_loss'
    trainer = pl.Trainer.from_argparse_args(
        args,
        resume_from_checkpoint=load_checkpoint_if_available(checkpoint_dir),
        logger=logger,
        checkpoint_callback=callbacks.ModelCheckpoint(
            filepath=f'{checkpoint_dir}/{{epoch}}-{{{loss_key}}}',
            save_top_k=5,
            monitor=loss_key,
            mode='min',
        ),
        **({'callbacks': custom_callbacks} if custom_callbacks else {}),
    )
    trainer.fit(model, data_module)
    return trainer
def main():
    # parse the arguments
    args = config.parse_args()
    if args.ngpu == 0:
        args.device = 'cpu'

    pl.seed_everything(args.manual_seed)

    callbacks = [cbs.RichProgressBar()]
    if args.save_results:
        logger = TensorBoardLogger(save_dir=args.logs_dir, log_graph=True, name=args.project_name)
        checkpoint = cbs.ModelCheckpoint(
            dirpath=os.path.join(args.save_dir, args.project_name),
            filename=args.project_name + '-{epoch:03d}-{val_loss:.3f}',
            monitor='val_loss',
            save_top_k=args.checkpoint_max_history,
            save_weights_only=True)
        enable_checkpointing = True
        callbacks.append(checkpoint)
    else:
        logger = False
        checkpoint = None
        enable_checkpointing = False

    if args.swa:
        callbacks.append(cbs.StochasticWeightAveraging())

    dataloader = getattr(datasets, args.dataset)(args)
    model = Model(args, dataloader)

    if args.ngpu == 0:
        strategy = None
        sync_batchnorm = False
    elif args.ngpu > 1:
        strategy = 'ddp'
        sync_batchnorm = True
    else:
        strategy = 'dp'
        sync_batchnorm = False

    trainer = pl.Trainer(gpus=args.ngpu,
                         strategy=strategy,
                         sync_batchnorm=sync_batchnorm,
                         benchmark=True,
                         callbacks=callbacks,
                         enable_checkpointing=enable_checkpointing,
                         logger=logger,
                         min_epochs=1,
                         max_epochs=args.nepochs,
                         precision=args.precision)
    trainer.fit(model)
    trainer.predict(model, dataloaders=dataloader.test_dataloader())
def test_monitor_val_epoch_end(tmpdir):
    epoch_min_loss_override = 0
    model = SimpleModule()
    checkpoint_callback = callbacks.ModelCheckpoint(dirpath=tmpdir, save_top_k=1, monitor="avg_val_loss")
    trainer = Trainer(
        max_epochs=epoch_min_loss_override + 2,
        logger=False,
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)
def train(args):
    model = RNN()
    data_module = DataModule(args)

    callbacks_list = None
    if args.val_path:
        callbacks_list = []
        callbacks_list.append(callbacks.EarlyStopping(monitor='val_acc', patience=PATIENCE))
        callbacks_list.append(callbacks.ModelCheckpoint(filepath=args.out_path, monitor='val_acc', prefix='rnn'))

    gpus = N_GPU if torch.cuda.is_available() else None
    trainer = pl.Trainer(gpus=gpus, max_epochs=MAX_EPOCHS, callbacks=callbacks_list)
    trainer.fit(model, datamodule=data_module)
def get_callbacks(cfg, output_dir):
    cbacks = []
    checkpoint_path = os.path.join(output_dir, cfg.checkpoint.name)
    checkpoint = pl_callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_last=False,
                                              monitor=cfg.checkpoint.monitor,
                                              mode=cfg.checkpoint.monitor_mode)
    cs = [
        pl.callbacks.EarlyStopping(monitor=cfg.checkpoint.monitor,
                                   mode=cfg.checkpoint.monitor_mode,
                                   **cfg.early_stopping),
        pl.callbacks.LearningRateMonitor(),
    ]
    return checkpoint, cs
def get_callbacks(cfg, output_dir):
    cbacks = []
    checkpoint_path = os.path.join(output_dir, cfg.CHECKPOINT.NAME)
    checkpoint = pl_callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_last=False,
                                              monitor=cfg.CHECKPOINT.MONITOR,
                                              mode=cfg.CHECKPOINT.MONITOR_MODE)
    cs = [
        pl_callbacks.EarlyStopping(monitor=cfg.CHECKPOINT.MONITOR,
                                   mode=cfg.CHECKPOINT.MONITOR_MODE,
                                   **cfg.EARLY_STOPPING),
        pl_callbacks.LearningRateLogger(),
        inspector.AnalysisCallback(),
    ]
    return checkpoint, cs
def load_callbacks():
    callbacks = []
    callbacks.append(
        plc.EarlyStopping(monitor='val_acc', mode='max', patience=10, min_delta=0.001))
    callbacks.append(
        plc.ModelCheckpoint(monitor='val_acc',
                            filename='best-{epoch:02d}-{val_acc:.3f}',
                            save_top_k=1,
                            mode='max',
                            save_last=True))
    if args.lr_scheduler:
        callbacks.append(plc.LearningRateMonitor(logging_interval='epoch'))
    return callbacks
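# Usage sketch (assumption, not from the source): load_callbacks() reads the module-level
# `args`, so it is called after argument parsing and the result passed straight to the Trainer.
# `model` and `data_module` are hypothetical.
# trainer = pl.Trainer(max_epochs=args.max_epochs, callbacks=load_callbacks())
# trainer.fit(model, datamodule=data_module)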
def get_loggers_callbacks(args, model=None):
    try:
        # Setup logger(s) params
        csv_logger_params = dict(
            save_dir="./experiments",
            name=os.path.join(*args.save_dir.split("/")[1:-1]),
            version=args.save_dir.split("/")[-1],
        )
        wandb_logger_params = dict(
            log_model=False,
            name=os.path.join(*args.save_dir.split("/")[1:]),
            offline=args.debug,
            project="utime",
            save_dir=args.save_dir,
        )
        loggers = [
            pl_loggers.CSVLogger(**csv_logger_params),
            pl_loggers.WandbLogger(**wandb_logger_params),
        ]
        if model:
            loggers[-1].watch(model)

        # Setup callback(s) params
        checkpoint_monitor_params = dict(
            filepath=os.path.join(args.save_dir, "{epoch:03d}-{eval_loss:.2f}"),
            monitor=args.checkpoint_monitor,
            save_last=True,
            save_top_k=1,
        )
        earlystopping_parameters = dict(
            monitor=args.earlystopping_monitor,
            patience=args.earlystopping_patience,
        )
        callbacks = [
            pl_callbacks.ModelCheckpoint(**checkpoint_monitor_params),
            pl_callbacks.EarlyStopping(**earlystopping_parameters),
            pl_callbacks.LearningRateMonitor(),
        ]
        return loggers, callbacks
    except AttributeError:
        return None, None
def get_checkpoint_callback(dirpath, monitor='train/loss', mode='min', filename="{epoch}",
                            save_last=True, save_top_k=2, every_n_train_steps=None):
    ckpt_callback = plc.ModelCheckpoint(
        dirpath=dirpath,
        filename=filename,  # ckpt_name + "_{epoch}",
        monitor=monitor,
        save_last=save_last,
        save_top_k=save_top_k,
        mode=mode,
        every_n_train_steps=every_n_train_steps,
        # verbose=True,
    )
    ckpt_callback.CHECKPOINT_NAME_LAST = "{epoch}_last"
    return ckpt_callback
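# Usage sketch (assumption, not part of the source): build a step-based checkpoint callback
# with the helper above and attach it to a Trainer; `LitModel` is a hypothetical LightningModule.
# ckpt_cb = get_checkpoint_callback(dirpath="checkpoints/", monitor="train/loss",
#                                   every_n_train_steps=500)
# trainer = pl.Trainer(max_epochs=10, callbacks=[ckpt_cb])
# trainer.fit(LitModel())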
def main():
    logger.remove()
    logger.add(sys.stdout, colorize=True,
               format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> "
                      + "| <level>{level}</level> "
                      + "| <light-black>{file.path}:{line}</light-black> | {message}")
    hparams = parse_args()
    if hparams.restore:
        wandb.init(project=hparams.project, tags=hparams.tags)
        model = LevelClassification.load_from_checkpoint(hparams.restore)
        logger.info("Restored model")
    else:
        # wandb.init is called in LevelClassification
        model = LevelClassification(hparams)
    experiment_logger = loggers.WandbLogger(project=hparams.project, tags=hparams.tags)
    hparams.checkpoint_dir = os.path.join(experiment_logger.experiment.dir, "checkpoints")
    checkpoint_cb = callbacks.ModelCheckpoint(hparams.checkpoint_dir, save_top_k=1)
    trainer = pl.Trainer(logger=experiment_logger,
                         gpus=1 if hparams.device == "cuda" else 0,
                         checkpoint_callback=checkpoint_cb,
                         callbacks=[EmbeddingsCallback()],
                         early_stop_callback=callbacks.EarlyStopping(),
                         fast_dev_run=hparams.debug)
    trainer.fit(model)
    model.freeze()

    baseline_datasets = []
    logger.info("Baselines {}", os.listdir(hparams.baseline_level_dir))
    for i, baseline_level_dir in enumerate(sorted(os.listdir(hparams.baseline_level_dir))):
        baseline_dataset = LevelSnippetDataset(
            level_dir=os.path.join(os.getcwd(), hparams.baseline_level_dir, baseline_level_dir),
            slice_width=model.dataset.slice_width,
            token_list=model.dataset.token_list)
        baseline_datasets.append(baseline_dataset)
    visualize_embeddings(model.dataset, model, "test", hparams, None, baseline_datasets)
def run(config):
    if isinstance(config, str):
        with open(config) as f:
            config = yaml.safe_load(f)
    now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    run_dir = path.join("wandb", now)
    run_dir = path.abspath(run_dir)
    os.environ['WANDB_RUN_DIR'] = run_dir

    checkpoint_callback = callbacks.ModelCheckpoint(
        run_dir, monitor=config['early_stopping']['monitor'])
    early_stopping_callback = callbacks.EarlyStopping(**config['early_stopping'])

    experiment = Experiment(config)
    trainer = pl.Trainer(logger=False,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         **config['trainer'])
    trainer.fit(experiment)
def main(cfg: DictConfig):
    model = pixellstm_pl(cfg)
    data = MNISTDataModule(cfg)
    logger = pl_loggers.TensorBoardLogger(save_dir=cfg.train.log_dir, version=cfg.train.version)
    checkpoint_callback = callbacks.ModelCheckpoint(
        monitor='val_loss',
        dirpath=cfg.train.checkpoint_dir,
        save_top_k=cfg.train.save_top_k)
    trainer = Trainer(
        accelerator=None if platform.system() == 'Windows' else 'ddp',
        accumulate_grad_batches=cfg.train.accumulate,
        auto_scale_batch_size=True,
        max_epochs=cfg.train.epochs,
        callbacks=[checkpoint_callback],
        default_root_dir=cfg.train.log_dir,
        fast_dev_run=cfg.train.fast_dev_run,
        gpus=cfg.train.gpus,
        logger=logger,
        terminate_on_nan=True,
        weights_save_path=cfg.train.checkpoint_dir,
        check_val_every_n_epoch=cfg.train.check_val_freq)
    trainer.fit(model, datamodule=data)
def train_faster_rcnn(train_df, oof_df):
    train_dataset = rcnn.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size, shuffle=True,
                                  num_workers=4, drop_last=True, collate_fn=collate_fn, pin_memory=True)
    oof_dataset = rcnn.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size, shuffle=False,
                                num_workers=4, collate_fn=collate_fn, pin_memory=True)

    # model = FasterRCNNResnet50FPN.load_from_checkpoint('checkpoints\\faster_rcnn\\epoch=9.ckpt', **Config)
    model = FasterRCNNResnet50FPN(conf=Config)
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=20, mode='min', verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir), monitor='val_loss',
                                           verbose=True, save_top_k=1)
    cbs = [callbacks.LearningRateLogger()]
    trainer = Trainer(gpus=1, early_stop_callback=early_stop, checkpoint_callback=checkpoint, callbacks=cbs,
                      benchmark=True, deterministic=True, max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=oof_dataloader)

    valid_dataset = rcnn.WheatDataset(get_data(mode='valid'), test=True, transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size, shuffle=False,
                                  num_workers=4, collate_fn=collate_fn, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
def main(cfg):
    model = e2vid(cfg)
    data = e2tensor_datamodule(cfg, hopath(cfg.dataset.dir))
    logger = pl_loggers.TensorBoardLogger(save_dir=cfg.train.log_dir, version=cfg.train.version)
    checkpoint_callback = callbacks.ModelCheckpoint(
        monitor="val_loss",
        dirpath=cfg.train.checkpoint_dir,
        save_top_k=cfg.train.save_top_k,
    )
    trainer = Trainer(
        accelerator=None if platform.system() == "Windows" else "ddp",
        auto_scale_batch_size=True,
        max_epochs=cfg.train.epochs,
        callbacks=[checkpoint_callback],
        default_root_dir=cfg.train.log_dir,
        fast_dev_run=True if cfg.runtype == "debug" else False,
        gpus=cfg.train.gpus,
        logger=logger,
        terminate_on_nan=True,
        weights_save_path=cfg.train.checkpoint_dir,
        check_val_every_n_epoch=cfg.train.check_val_freq,
    )
    trainer.fit(model, datamodule=data)
def main():
    args = parse_args()

    if args.debug or not args.non_deterministic:
        np.random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed(1)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        # torch.set_deterministic(True)  # grid_sampler_2d_backward_cuda does not have a deterministic implementation
    if args.debug:
        torch.autograd.set_detect_anomaly(True)

    dataloader_args = EasyDict(
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0 if args.debug else args.data_workers)

    if args.dataset == 'mnist':
        args.num_classes = 10
        args.im_channels = 1
        args.image_size = (40, 40)

        from torchvision.datasets import MNIST
        t = transforms.Compose([
            transforms.RandomCrop(size=(40, 40), pad_if_needed=True),
            transforms.ToTensor(),
            # norm_1c
        ])
        train_dataloader = DataLoader(
            MNIST(data_path / 'mnist', train=True, transform=t, download=True), **dataloader_args)
        val_dataloader = DataLoader(
            MNIST(data_path / 'mnist', train=False, transform=t, download=True), **dataloader_args)
    elif args.dataset == 'usps':
        args.num_classes = 10
        args.im_channels = 1
        args.image_size = (40, 40)

        from torchvision.datasets import USPS
        t = transforms.Compose([
            transforms.RandomCrop(size=(40, 40), pad_if_needed=True),
            transforms.ToTensor(),
            # norm_1c
        ])
        train_dataloader = DataLoader(
            USPS(data_path / 'usps', train=True, transform=t, download=True), **dataloader_args)
        val_dataloader = DataLoader(
            USPS(data_path / 'usps', train=False, transform=t, download=True), **dataloader_args)
    elif args.dataset == 'constellation':
        data_gen = create_constellation(
            batch_size=args.batch_size,
            shuffle_corners=True,
            gaussian_noise=.0,
            drop_prob=0.5,
            which_patterns=[[0], [1], [0]],
            rotation_percent=180 / 360.,
            max_scale=3.,
            min_scale=3.,
            use_scale_schedule=False,
            schedule_steps=0,
        )
        train_dataloader = DataLoader(data_gen, **dataloader_args)
        val_dataloader = DataLoader(data_gen, **dataloader_args)
    elif args.dataset == 'cifar10':
        args.num_classes = 10
        args.im_channels = 3
        args.image_size = (32, 32)

        from torchvision.datasets import CIFAR10
        t = transforms.Compose([transforms.ToTensor()])
        train_dataloader = DataLoader(
            CIFAR10(data_path / 'cifar10', train=True, transform=t, download=True), **dataloader_args)
        val_dataloader = DataLoader(
            CIFAR10(data_path / 'cifar10', train=False, transform=t, download=True), **dataloader_args)
    elif args.dataset == 'svhn':
        args.num_classes = 10
        args.im_channels = 3
        args.image_size = (32, 32)

        from torchvision.datasets import SVHN
        t = transforms.Compose([transforms.ToTensor()])
        train_dataloader = DataLoader(
            SVHN(data_path / 'svhn', split='train', transform=t, download=True), **dataloader_args)
        val_dataloader = DataLoader(
            SVHN(data_path / 'svhn', split='test', transform=t, download=True), **dataloader_args)
    else:
        raise NotImplementedError()

    logger = WandbLogger(project=args.log.project, name=args.log.run_name, entity=args.log.team,
                         config=args, offline=not args.log.upload)

    if args.model == 'ccae':
        from scae.modules.attention import SetTransformer
        from scae.modules.capsule import CapsuleLayer
        from scae.models.ccae import CCAE

        encoder = SetTransformer(2)
        decoder = CapsuleLayer(input_dims=32, n_caps=3, n_caps_dims=2, n_votes=4,
                               n_caps_params=32, n_hiddens=128, learn_vote_scale=True,
                               deformations=True, noise_type='uniform', noise_scale=4.,
                               similarity_transform=False)
        model = CCAE(encoder, decoder, args)
        # logger.watch(encoder._encoder, log='all', log_freq=args.log_frequency)
        # logger.watch(decoder, log='all', log_freq=args.log_frequency)
    elif args.model == 'pcae':
        from scae.modules.part_capsule_ae import CapsuleImageEncoder, TemplateImageDecoder
        from scae.models.pcae import PCAE

        encoder = CapsuleImageEncoder(args)
        decoder = TemplateImageDecoder(args)
        model = PCAE(encoder, decoder, args)
        logger.watch(encoder._encoder, log='all', log_freq=args.log.frequency)
        logger.watch(decoder, log='all', log_freq=args.log.frequency)
    elif args.model == 'ocae':
        from scae.modules.object_capsule_ae import SetTransformer, ImageCapsule
        from scae.models.ocae import OCAE

        encoder = SetTransformer()
        decoder = ImageCapsule()
        model = OCAE(encoder, decoder, args)
        # TODO: after ccae
    else:
        raise NotImplementedError()

    # Execute Experiment
    lr_logger = cb.LearningRateMonitor(logging_interval='step')
    best_checkpointer = cb.ModelCheckpoint(save_top_k=1, monitor='val_rec_ll', filepath=logger.experiment.dir)
    last_checkpointer = cb.ModelCheckpoint(save_last=True, filepath=logger.experiment.dir)
    trainer = pl.Trainer(
        max_epochs=args.num_epochs,
        logger=logger,
        callbacks=[lr_logger, best_checkpointer, last_checkpointer])
    trainer.fit(model, train_dataloader, val_dataloader)
def test_eval_logging_auto_reduce(tmpdir):
    """ Tests that only training_step can be used """
    seed_everything(1234)
    os.environ['PL_DEV_DEBUG'] = '1'

    class TestModel(BoringModel):
        def on_pretrain_routine_end(self) -> None:
            self.seen_vals = []
            self.manual_epoch_end_mean = None

        def on_validation_epoch_start(self) -> None:
            self.seen_vals = []

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.seen_vals.append(loss)
            self.log('val_loss', loss, on_epoch=True, on_step=True, prog_bar=True)
            return {"x": loss}

        def validation_epoch_end(self, outputs) -> None:
            for passed_in, manually_tracked in zip(outputs, self.seen_vals):
                assert passed_in['x'] == manually_tracked
            self.manual_epoch_end_mean = torch.stack([x['x'] for x in outputs]).mean()

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=3,
        limit_val_batches=3,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        checkpoint_callback=callbacks.ModelCheckpoint(dirpath='val_loss'))
    trainer.fit(model)

    # make sure all the metrics are available for callbacks
    manual_mean = model.manual_epoch_end_mean
    callback_metrics = set(trainer.callback_metrics.keys())
    assert callback_metrics == {'debug_epoch', 'val_loss', 'val_loss_epoch'}

    # make sure values are correct
    assert trainer.logged_metrics['val_loss_epoch'] == manual_mean
    assert trainer.callback_metrics['val_loss'] == trainer.logged_metrics['val_loss_step/epoch_0']

    # make sure correct values were logged
    logged_val = trainer.dev_debugger.logged_metrics

    # sanity check
    assert logged_val[0]['global_step'] == 0
    assert logged_val[1]['global_step'] == 0

    # 3 val batches
    assert logged_val[2]['val_loss_step/epoch_0'] == model.seen_vals[0]
    assert logged_val[3]['val_loss_step/epoch_0'] == model.seen_vals[1]
    assert logged_val[4]['val_loss_step/epoch_0'] == model.seen_vals[2]

    # epoch mean
    assert logged_val[5]['val_loss_epoch'] == model.manual_epoch_end_mean

    # only those logged
    assert len(logged_val) == 6
                     collate_fn=_collate_fn)
test_dataset = LanguageModelingDataset(datasets['test'])
test_batch_sampler = BPTTBatchSampler(test_dataset, hparams.bptt, hparams.batch_size)
test_data = DataLoader(test_dataset, num_workers=8, pin_memory=True,
                       batch_sampler=test_batch_sampler, collate_fn=_collate_fn)

early_stop_callback = callbacks.EarlyStopping(monitor='val_ppl', mode='min')
model_checkpoint_callback = callbacks.ModelCheckpoint(
    monitor='val_ppl', save_last=True, save_top_k=5, save_weights_only=False, mode='min')
trainer = Trainer.from_argparse_args(
    hparams,
    default_root_dir=os.path.abspath(os.path.expanduser("~/data/awd-lstm")),
    callbacks=[early_stop_callback, model_checkpoint_callback, NNICallback()])
del hparams.tpu_cores
model = LanguageModel(hparams)
def test__training_step__log(tmpdir):
    """ Tests that only training_step can be used """

    class TestModel(DeterministicModel):
        def training_step(self, batch, batch_idx):
            acc = self.step(batch, batch_idx)
            acc = acc + batch_idx

            # -----------
            # default
            # -----------
            self.log('default', acc)

            # -----------
            # logger
            # -----------
            # on_step T on_epoch F
            self.log('l_s', acc, on_step=True, on_epoch=False, prog_bar=False, logger=True)
            # on_step F on_epoch T
            self.log('l_e', acc, on_step=False, on_epoch=True, prog_bar=False, logger=True)
            # on_step T on_epoch T
            self.log('l_se', acc, on_step=True, on_epoch=True, prog_bar=False, logger=True)

            # -----------
            # pbar
            # -----------
            # on_step T on_epoch F
            self.log('p_s', acc, on_step=True, on_epoch=False, prog_bar=True, logger=False)
            # on_step F on_epoch T
            self.log('p_e', acc, on_step=False, on_epoch=True, prog_bar=True, logger=False)
            # on_step T on_epoch T
            self.log('p_se', acc, on_step=True, on_epoch=True, prog_bar=True, logger=False)

            self.training_step_called = True
            return acc

        def backward(self, loss, optimizer, optimizer_idx):
            return LightningModule.backward(self, loss, optimizer, optimizer_idx)

    model = TestModel()
    model.val_dataloader = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        max_epochs=2,
        log_every_n_steps=1,
        weights_summary=None,
        checkpoint_callback=callbacks.ModelCheckpoint(monitor='l_se'))
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert not model.training_epoch_end_called

    # make sure all the metrics are available for callbacks
    logged_metrics = set(trainer.logged_metrics.keys())
    expected_logged_metrics = {
        'epoch',
        'default',
        'l_e',
        'l_s',
        'l_se_step',
        'l_se_epoch',
    }
    assert logged_metrics == expected_logged_metrics

    pbar_metrics = set(trainer.progress_bar_metrics.keys())
    expected_pbar_metrics = {
        'p_e',
        'p_s',
        'p_se_step',
        'p_se_epoch',
    }
    assert pbar_metrics == expected_pbar_metrics

    callback_metrics = set(trainer.callback_metrics.keys())
    callback_metrics.remove('debug_epoch')
    expected_callback_metrics = set()
    expected_callback_metrics = expected_callback_metrics.union(logged_metrics)
    expected_callback_metrics = expected_callback_metrics.union(pbar_metrics)
    expected_callback_metrics.update({'p_se', 'l_se'})
    expected_callback_metrics.remove('epoch')
    assert callback_metrics == expected_callback_metrics
solver = FewshotSolver(classifier)
if classifier_name.startswith("reg"):
    solver.weight_decay = 1e-4
else:
    solver.weight_decay = 0

tb_logger = pllog.TensorBoardLogger("logs_byte/" + classifier_name)
trainer = pl.Trainer(
    logger=tb_logger,
    gpus=1,
    max_epochs=10,
    log_every_n_steps=1,
    precision=32,
    check_val_every_n_epoch=1,
    auto_lr_find=True,
    callbacks=[
        FewshotDatasetReplacement(datasets, every_batch=20),
        plcb.ModelCheckpoint()
    ])
trainer.tune(solver, train_dataloader=datasets.train_dataloader())
trainer.fit(
    solver,
    train_dataloader=datasets.train_dataloader(),
    val_dataloaders=[
        datasets.val_dataloader(seen=False, unseen=True),
        datasets.val_dataloader(seen=True, unseen=False),
    ]
)
def train(config: Config):
    """Script that runs the training process."""
    pl.seed_everything(config.random_seed)

    # Settings for e.g. resuming training from a previous run
    cache_dir = pathlib.Path(config.cache_dir)
    cache_dir.mkdir(exist_ok=True)
    trainer_params = dict()
    lastckpt = cache_dir.joinpath("last.ckpt")
    if config.resume:
        trainer_params["resume_from_checkpoint"] = str(lastckpt)
    elif lastckpt.exists():
        lastckpt.unlink()
        for filepath in cache_dir.glob("epoch*.ckpt"):
            filepath.unlink()

    # Logging setup
    pl_logger = pl_loggers.MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None),
        tags={
            "mlflow.source.name": pathlib.Path(__file__).name,
            "mlflow.source.git.commit": ut.get_commit_id(),
        },
    )

    # Build the network and dataset, then train
    network = tv_models.vgg16(pretrained=False)
    params = dc.asdict(config)
    model = Trainer(network, **params)  # project-specific LightningModule wrapper, not pl.Trainer

    callbacks: t.List[t.Any] = list()
    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filepath=str(cache_dir),
        monitor="val_loss",
        save_last=True,
        save_top_k=config.save_top_k,
        save_weights_only=config.save_weights_only,
        mode="min",
        period=1,
    )
    callbacks.append(model_checkpoint)
    if config.early_stop:
        callbacks.append(
            pl_callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=3,
                verbose=False,
                mode="auto",
            ))

    pl_trainer = pl.Trainer(
        default_root_dir=str(cache_dir),
        fast_dev_run=False,
        min_epochs=config.min_epochs,
        max_epochs=config.max_epochs,
        gpus=[0] if config.use_gpu and cuda.is_available() else None,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        profiler=config.profiler,
        callbacks=callbacks,
        logger=pl_logger,
        log_gpu_memory=True,
        **trainer_params,
    )
    datamodule = dataset_food101.Food101WithLableModule(
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )
    pl_trainer.fit(model, datamodule)

    # Attach additional information to the logged run
    mlf_client = mlflow.tracking.MlflowClient()
    for ckptfile in cache_dir.glob("epoch*.ckpt"):
        model = model.load_from_checkpoint(str(ckptfile), network, **params)
        with tempfile.TemporaryDirectory() as dname:
            mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem)
            mlf_pytorch.save_model(model.network, mlf_model_path)
            mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)
from pytorch_lightning import Trainer, callbacks
from src.main.config import cfg_ds_v1, cfg_train
from src.main.feeder import NtuFeeder
from src.main.model import KhoiddNet
from torch.utils.data import DataLoader

if __name__ == "__main__":
    trainer: Trainer = Trainer(
        gpus=-1,  # -1: train on all gpus
        use_amp=True,
        max_epochs=200,
        # callback
        checkpoint_callback=callbacks.ModelCheckpoint(
            filepath=cfg_train.output_train + "/model",
            save_best_only=True,
            monitor="val_loss",
            mode="min",
        ),
        # only use when debug
        fast_dev_run=False,
        show_progress_bar=True,
        train_percent_check=1.0,  # percent of train data
        val_percent_check=1.0,  # percent of val data
        check_val_every_n_epoch=1,  # epoch per val
        val_check_interval=1.0,  # val per epoch
    )
    trainer.fit(
        model=KhoiddNet(),
        train_dataloader=DataLoader(
            dataset=NtuFeeder(
                path_data=cfg_ds_v1.path_data_preprocess +