def main(conf):
    train_set = PodcastMixDataloader(
        csv_dir=conf["data"]["train_dir"],
        sample_rate=conf["data"]["sample_rate"],
        original_sample_rate=conf["data"]["original_sample_rate"],
        segment=conf["data"]["segment"],
        shuffle_tracks=True,
        multi_speakers=conf["training"]["multi_speakers"],
    )
    val_set = PodcastMixDataloader(
        csv_dir=conf["data"]["valid_dir"],
        sample_rate=conf["data"]["sample_rate"],
        original_sample_rate=conf["data"]["original_sample_rate"],
        segment=conf["data"]["segment"],
        shuffle_tracks=True,
        multi_speakers=conf["training"]["multi_speakers"],
    )
    train_loader = DataLoader(
        train_set,
        shuffle=True,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
        pin_memory=True,
    )
    val_loader = DataLoader(
        val_set,
        shuffle=False,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
        pin_memory=True,
    )
    if conf["model"]["name"] == "ConvTasNet":
        sys.path.append("ConvTasNet_model")
        from conv_tasnet_norm import ConvTasNetNorm

        conf["masknet"].update({"n_src": conf["data"]["n_src"]})
        model = ConvTasNetNorm(
            **conf["filterbank"], **conf["masknet"], sample_rate=conf["data"]["sample_rate"]
        )
        loss_func = LogL2Time()
        plugins = None
    elif conf["model"]["name"] == "UNet":
        # UNet with LogL2 time loss and normalization inside the model
        sys.path.append("UNet_model")
        from unet_model import UNet

        model = UNet(
            conf["data"]["sample_rate"],
            conf["data"]["fft_size"],
            conf["data"]["hop_size"],
            conf["data"]["window_size"],
            conf["convolution"]["kernel_size"],
            conf["convolution"]["stride"],
        )
        loss_func = LogL2Time()
        plugins = DDPPlugin(find_unused_parameters=False)
    optimizer = make_optimizer(model.parameters(), **conf["optim"])
    scheduler = None
    if conf["training"]["half_lr"]:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)
    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf["model"]["name"] + "_model/" + conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)
    system = System(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        config=conf,
    )
    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(
        checkpoint_dir, monitor="val_loss", mode="min", save_top_k=5, verbose=True
    )
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(EarlyStopping(monitor="val_loss", mode="min", patience=100, verbose=True))
    # Don't ask for GPUs if they are not available.
    gpus = -1 if torch.cuda.is_available() else None
    distributed_backend = "ddp" if torch.cuda.is_available() else None
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend=distributed_backend,
        gradient_clip_val=5.0,
        resume_from_checkpoint=conf["main_args"]["resume_from"],
        precision=32,
        plugins=plugins,
    )
    trainer.fit(system)
    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)
    print(checkpoint.best_model_path)
    state_dict = torch.load(checkpoint.best_model_path)
    system.load_state_dict(state_dict=state_dict["state_dict"])
    system.cpu()
    to_save = system.model.serialize()
    to_save.update(train_set.get_infos())
    torch.save(to_save, os.path.join(exp_dir, "best_model.pth"))
def __call__(self, trial):
    torch.cuda.empty_cache()
    trial.set_user_attr('fold', self.fold)
    kwargs = self.suggestions(trial)
    start = datetime.now()
    print(f"Training on fold {self.fold}")
    train_ds, val_ds, input_cols, cond_cols = read(self.path, self.exp, self.fold, self.subset)
    cols = list(np.concatenate((input_cols, cond_cols, [self.target])))
    train = train_ds.to_table(columns=cols).to_pandas()
    val = val_ds.to_table(columns=cols).to_pandas()
    # DataModule
    dm = CTRPDataModule(train, val, input_cols, cond_cols, self.target, kwargs['batch_size'])
    del train, val
    print(f"Time elapsed loading data: {datetime.now() - start}")
    # Model
    model = ConditionalNetwork(
        n_blocks=kwargs['n_blocks'],
        exp=self.exp,
        inputs_sz=len(dm.input_cols),
        conds_sz=len(dm.cond_cols),
        inputs_emb_layers=kwargs['inputs_emb_layers'],
        conds_emb_layers=kwargs['conds_emb_layers'],
        film_layers=kwargs['film_layers'],
        linear_layers=kwargs['linear_layers'],
        ps_emb=kwargs['ps_emb'],
        ps_film=kwargs['ps_film'],
        ps_linear=kwargs['ps_linear'],
        learning_rate=kwargs['learning_rate'],
        weight_decay=kwargs['weight_decay'],
        batch_size=kwargs['batch_size'])
    # Callbacks
    logger = TensorBoardLogger(
        save_dir=self.logs,
        version=f"trial{trial.number}_{self.exp}_fold_{self.fold}",
        name='model_logs')
    early_stop = EarlyStopping(
        monitor='val_r2', min_delta=0.0001, patience=12, verbose=False, mode='max')
    # Trainer
    trainer = Trainer(
        default_root_dir=logger.log_dir,  # in order to avoid lr_find_temp.ckpt conflicts
        auto_lr_find=False,
        auto_scale_batch_size=False,
        max_epochs=self.epochs,
        gpus=self.gpu,
        accelerator=self.accelerator,
        logger=logger,
        callbacks=[PyTorchLightningPruningCallback(trial, monitor="val_r2"), early_stop],
        flush_logs_every_n_steps=200,
        precision=32,
        profiler="simple",
        deterministic=True)
    trainer.fit(model, dm)
    # Save the metric and clean up the GPU
    r2 = trainer.callback_metrics["val_r2"].item()
    del dm, model, trainer
    torch.cuda.empty_cache()
    print("Completed fold {} in {}".format(self.fold, str(datetime.now() - start)))
    print(f'Fold val_r2: {r2}')
    return r2
def main(hparams): """ Trains the Lightning model as specified in `hparams` """ # in order to make sure every model in multi-GPU have the same weight seed = 1234567 torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. np.random.seed(seed) # Numpy module. random.seed(seed) # Python random module. torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True model = Lightning_Unet(hparams) if COMPUTECANADA: cur_path = Path(__file__).resolve().parent default_root_dir = cur_path checkpoint_file = Path( __file__).resolve().parent / "checkpoint/{epoch}-{val_dice:.5f}" if not os.path.exists(Path(__file__).resolve().parent / "checkpoint"): os.mkdir(Path(__file__).resolve().parent / "checkpoint") else: default_root_dir = "./log" if not os.path.exists(default_root_dir): os.mkdir(default_root_dir) checkpoint_file = "./log/checkpoint" if not os.path.exists(checkpoint_file): os.mkdir(checkpoint_file) checkpoint_file = Path(checkpoint_file) / "{epoch}-{val_dice:.2f}" # After training finishes, use best_model_path to retrieve the path to the best # checkpoint file and best_model_score to retrieve its score. checkpoint_callback = ModelCheckpoint( filepath=checkpoint_file, save_top_k=3, verbose=True, # monitor='val_dice', mode='max', prefix='', save_weights_only=False, # could realize to save the checkpoint several times in one epoch period=-1, ) early_stop_callback = EarlyStopping( # monitor='val_loss', min_delta=0.00, patience=300, strict=True, verbose=False, mode='max') tb_logger = loggers.TensorBoardLogger(hparams.TensorBoardLogger) trainer = Trainer( gpus=hparams.gpus, num_nodes=hparams.nodes, distributed_backend='ddp', # the next two can be combined to use, in a straight way val_check_interval=0.5, # check_val_every_n_epoch=3, # log every k batches instead row_log_interval=10, # set the interval at which you want to log using this trainer flag. log_save_interval=10, checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback, callbacks=[LearningRateLogger()], # runs 1 train, val, test batch and program ends fast_dev_run=hparams.fast_dev_run, default_root_dir=default_root_dir, logger=tb_logger, max_epochs=10000, # this need to be string # resume_from_checkpoint=str(Path(__file__).resolve().parent / "checkpoint" / hparams.checkpoint_file), profiler=True, auto_lr_find=False, # simulate a larger batch size for gradient descent to provide a good estimate # accumulate_grad_batches=4, ) # if COMPUTECANADA: # pickle.dumps(model) # lr_finder = trainer.lr_find(model) # # # Plot with # fig = lr_finder.plot(suggest=True) # fig.show() # # # Pick point based on plot, or get suggestion # new_lr = lr_finder.suggestion() # print(f"recommend learning_rate: {new_lr}") # model.hparams.learning_rate = new_lr trainer.fit(model)
def test_early_stopping_mode_options():
    with pytest.raises(MisconfigurationException, match="`mode` can be .* got unknown_option"):
        EarlyStopping(mode="unknown_option")
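# A hedged companion sketch (not part of the original test file): the inverse check that the
# documented modes "min" and "max" construct an EarlyStopping instance without raising. It
# assumes only the public EarlyStopping(monitor=..., mode=...) signature used above.
@pytest.mark.parametrize("mode", ["min", "max"])
def test_early_stopping_valid_mode_options(mode):
    early_stopping = EarlyStopping(monitor="val_loss", mode=mode)
    assert early_stopping.mode == mode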
def main(conf): train_set = WhamDataset( conf["data"]["train_dir"], conf["data"]["task"], sample_rate=conf["data"]["sample_rate"], segment=conf["data"]["segment"], nondefault_nsrc=conf["data"]["nondefault_nsrc"], ) val_set = WhamDataset( conf["data"]["valid_dir"], conf["data"]["task"], sample_rate=conf["data"]["sample_rate"], nondefault_nsrc=conf["data"]["nondefault_nsrc"], ) train_loader = DataLoader( train_set, shuffle=True, batch_size=conf["training"]["batch_size"], num_workers=conf["training"]["num_workers"], drop_last=True, ) val_loader = DataLoader( val_set, shuffle=False, batch_size=conf["training"]["batch_size"], num_workers=conf["training"]["num_workers"], drop_last=True, ) # Update number of source values (It depends on the task) conf["masknet"].update({"n_src": train_set.n_src}) model = DPTNet(**conf["filterbank"], **conf["masknet"]) optimizer = make_optimizer(model.parameters(), **conf["optim"]) from asteroid.engine.schedulers import DPTNetScheduler schedulers = { "scheduler": DPTNetScheduler( optimizer, len(train_loader) // conf["training"]["batch_size"], 64 ), "interval": "step", } # Just after instantiating, save the args. Easy loading in the future. exp_dir = conf["main_args"]["exp_dir"] os.makedirs(exp_dir, exist_ok=True) conf_path = os.path.join(exp_dir, "conf.yml") with open(conf_path, "w") as outfile: yaml.safe_dump(conf, outfile) # Define Loss function. loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx") system = System( model=model, loss_func=loss_func, optimizer=optimizer, scheduler=schedulers, train_loader=train_loader, val_loader=val_loader, config=conf, ) # Define callbacks callbacks = [] checkpoint_dir = os.path.join(exp_dir, "checkpoints/") checkpoint = ModelCheckpoint( checkpoint_dir, monitor="val_loss", mode="min", save_top_k=5, verbose=True ) callbacks.append(checkpoint) if conf["training"]["early_stop"]: callbacks.append(EarlyStopping(monitor="val_loss", mode="min", patience=30, verbose=True)) # Don't ask GPU if they are not available. gpus = -1 if torch.cuda.is_available() else None distributed_backend = "ddp" if torch.cuda.is_available() else None trainer = pl.Trainer( max_epochs=conf["training"]["epochs"], callbacks=callbacks, default_root_dir=exp_dir, gpus=gpus, distributed_backend=distributed_backend, gradient_clip_val=conf["training"]["gradient_clipping"], ) trainer.fit(system) best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: json.dump(best_k, f, indent=0) state_dict = torch.load(checkpoint.best_model_path) system.load_state_dict(state_dict=state_dict["state_dict"]) system.cpu() to_save = system.model.serialize() to_save.update(train_set.get_infos()) torch.save(to_save, os.path.join(exp_dir, "best_model.pth"))
def main(conf): train_dirs = [ conf["data"]["train_dir"].format(n_src) for n_src in conf["masknet"]["n_srcs"] ] valid_dirs = [ conf["data"]["valid_dir"].format(n_src) for n_src in conf["masknet"]["n_srcs"] ] train_set = Wsj0mixVariable( json_dirs=train_dirs, n_srcs=conf["masknet"]["n_srcs"], sample_rate=conf["data"]["sample_rate"], seglen=conf["data"]["seglen"], minlen=conf["data"]["minlen"], ) val_set = Wsj0mixVariable( json_dirs=valid_dirs, n_srcs=conf["masknet"]["n_srcs"], sample_rate=conf["data"]["sample_rate"], seglen=conf["data"]["seglen"], minlen=conf["data"]["minlen"], ) train_loader = DataLoader( train_set, shuffle=True, batch_size=conf["training"]["batch_size"], num_workers=conf["training"]["num_workers"], drop_last=True, collate_fn=_collate_fn, ) val_loader = DataLoader( val_set, shuffle=False, batch_size=conf["training"]["batch_size"], num_workers=conf["training"]["num_workers"], drop_last=True, collate_fn=_collate_fn, ) model, optimizer = make_model_and_optimizer( conf, sample_rate=conf["data"]["sample_rate"]) scheduler = [] if conf["training"]["half_lr"]: scheduler.append( ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)) if conf["training"]["lr_decay"]: scheduler.append(ExponentialLR(optimizer=optimizer, gamma=0.99)) exp_dir = conf["main_args"]["exp_dir"] os.makedirs(exp_dir, exist_ok=True) conf_path = os.path.join(exp_dir, "conf.yml") with open(conf_path, "w") as outfile: yaml.safe_dump(conf, outfile) loss_func = WeightedPITLoss(n_srcs=conf["masknet"]["n_srcs"], lamb=conf["loss"]["lambda"]) # Put together in System system = VarSpkrSystem( model=model, loss_func=loss_func, optimizer=optimizer, train_loader=train_loader, val_loader=val_loader, scheduler=scheduler, config=conf, ) # Define callbacks callbacks = [] checkpoint_dir = os.path.join(exp_dir, "checkpoints/") checkpoint = ModelCheckpoint( dirpath=checkpoint_dir, filename="{epoch}-{step}", monitor="avg_sdr", mode="max", save_top_k=5, verbose=True, ) callbacks.append(checkpoint) if conf["training"]["early_stop"]: callbacks.append( EarlyStopping(monitor="avg_sdr", mode="max", patience=30, verbose=True)) # Don't ask GPU if they are not available. gpus = -1 if torch.cuda.is_available() else None distributed_backend = "dp" if torch.cuda.is_available() else None # Train model trainer = pl.Trainer( max_epochs=conf["training"]["epochs"], callbacks=callbacks, default_root_dir=exp_dir, gpus=gpus, distributed_backend=distributed_backend, limit_train_batches=1.0, # Useful for fast experiment gradient_clip_val=200, resume_from_checkpoint=conf["main_args"]["resume_from"], ) trainer.fit(system) best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: json.dump(best_k, f, indent=0) # Save last model for convenience torch.save(system.model.state_dict(), os.path.join(exp_dir, "final_model.pth"))
        return self._epoch_end()

    def on_train_end(self) -> None:
        assert self.trainer.current_epoch == self.expected_end_epoch, 'Early Stopping Failed'


_ES_CHECK = dict(check_on_train_epoch_end=True)
_ES_CHECK_P3 = dict(patience=3, check_on_train_epoch_end=True)
_NO_WIN = dict(marks=RunIf(skip_windows=True))


@pytest.mark.parametrize(
    "callbacks, expected_stop_epoch, check_on_train_epoch_end, accelerator, num_processes",
    [
        ([EarlyStopping('abc'), EarlyStopping('cba', patience=3)], 3, False, None, 1),
        ([EarlyStopping('cba', patience=3), EarlyStopping('abc')], 3, False, None, 1),
        pytest.param([EarlyStopping('abc'), EarlyStopping('cba', patience=3)],
                     3, False, 'ddp_cpu', 2, **_NO_WIN),
        pytest.param([EarlyStopping('cba', patience=3), EarlyStopping('abc')],
                     3, False, 'ddp_cpu', 2, **_NO_WIN),
        ([EarlyStopping('abc', **_ES_CHECK), EarlyStopping('cba', **_ES_CHECK_P3)], 3, True, None, 1),
        ([EarlyStopping('cba', **_ES_CHECK_P3), EarlyStopping('abc', **_ES_CHECK)], 3, True, None, 1),
        pytest.param([EarlyStopping('abc', **_ES_CHECK), EarlyStopping('cba', **_ES_CHECK_P3)],
                     3, True, 'ddp_cpu', 2, **_NO_WIN),
        pytest.param([EarlyStopping('cba', **_ES_CHECK_P3), EarlyStopping('abc', **_ES_CHECK)],
                     3, True, 'ddp_cpu', 2, **_NO_WIN),
    ],
)
def test_multiple_early_stopping_callbacks(
    tmpdir,
    callbacks: List[EarlyStopping],
    expected_stop_epoch: int,
def train(cfg, model, transforms, train_data, val_data): gc.collect() pl.seed_everything(cfg.seed) # init logger loggers = [] # init logger if cfg.logger == 'csv': logger = pl.loggers.CSVLogger(save_dir='logs/', name=cfg.model_type) loggers.append(logger) elif cfg.logger == 'tensorboard': logger = pl.loggers.TensorBoardLogger('tb_logs', name=cfg.model_type) loggers.append(logger) elif cfg.logger == 'wandb': logger = pl.loggers.WandbLogger(project='Plant Pathology 2021 - FGVC8') loggers.append(logger) elif cfg.logger == 'all': logger1 = pl.loggers.CSVLogger(save_dir='logs/', name=cfg.model_type) logger2 = pl.loggers.TensorBoardLogger('tb_logs', name=cfg.model_type) logger3 = pl.loggers.WandbLogger( project='Plant Pathology 2021 - FGVC8') loggers.append(*[logger1, logger2, logger3]) else: pass # init callbacks ckpt_min_loss = ModelCheckpoint( monitor='total_loss', save_top_k=cfg.ckpt_save_top_k, mode='min', save_last=cfg.ckpt_save_last, filename=os.path.join( "checkpoint", f"min-loss-fold={cfg.fold_i}" + "-{epoch}-{valid_loss:.4f}-{valid_f1:.4f}"), ) ckpt2_max_f1 = ModelCheckpoint( monitor='valid_five_f1', save_top_k=cfg.ckpt_save_top_k, mode='max', save_last=cfg.ckpt_save_last, filename=os.path.join( "checkpoint", f"best-f1-fold={cfg.fold_i}" + "-{epoch}-{valid_loss:.4f}-{valid_f1:.4f}"), ) early = EarlyStopping( monitor='valid_five_f1', patience=cfg.early_patience, mode='max', verbose=True ) lr_monitor = LearningRateMonitor('step') swa = pl.callbacks.StochasticWeightAveraging() # all callbacks callbacks = [ckpt_min_loss, ckpt2_max_f1, early, lr_monitor, swa] # init model if cfg.two_head: model = LitPlantModule2(cfg) else: model = LitPlantModule(cfg) # init dataloaders dataloaders = generate_dataloaders(cfg, train_data, val_data, transforms) # init trainer trainer = pl.Trainer( fast_dev_run=False, gpus=1, callbacks=callbacks, logger=loggers, min_epochs=cfg.min_epochs, max_epochs=cfg.max_epochs, val_check_interval=0.25, progress_bar_refresh_rate=1, weights_summary='top', precision=cfg.precision, # limit_train_batches=1, # limit_val_batches=1, gradient_clip_val=cfg.gradient_clip_val, accumulate_grad_batches=int(64 / cfg.batch_size) ) trainer.fit(model=model, train_dataloader=dataloaders['train'], val_dataloaders=dataloaders['val'])
    # print(self.retriever.predict('I am beautiful lady?', ['You are a pretty girl',
    #                                                       'apple is tasty',
    #                                                       'He is a handsome boy'], True))


if __name__ == '__main__':
    encoder_question = BertEncoder(bert_question, max_question_len_global)
    encoder_paragarph = BertEncoder(bert_paragraph, max_paragraph_len_global)
    ret = Retriver(encoder_question, encoder_paragarph, tokenizer)

    os.makedirs('out', exist_ok=True)
    checkpoint_callback = ModelCheckpoint(
        filepath='out/largebatch-crossentropy-{epoch}-{val_loss:.2f}-{val_acc:.2f}',
        save_top_k=10,
        verbose=True,
        monitor='val_acc',
        mode='max')
    early_stopping = EarlyStopping('val_acc', mode='max')

    trainer = pl.Trainer(
        gpus=8,
        distributed_backend='dp',
        val_check_interval=0.1,
        min_epochs=1,
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stopping)

    ret_trainee = RetriverTrainer(ret)
    trainer.fit(ret_trainee)
def main(): logger = logging.getLogger(__name__) start_time = datetime.datetime.now() model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs), verbose=True) train_orig_df, label_enc = load_train_dataframe(training_args.data_train, min_class_samples=training_args.min_class_samples) # assert training_args.test_size % training_args.batch_size == 0, "Test size should be multiple of batch size" # TODO: split DFs once and keep those on the disk. Reload label_enc from disk on resume. train_df, valid_df = train_test_split(train_orig_df, test_size=training_args.test_size, stratify=train_orig_df.landmark_id, random_state=SEED) num_classes = train_df.landmark_id.nunique() if training_args.min_class_samples is None else len(label_enc.classes_) logger.info(f'Num classes train: {num_classes}') logger.info(f'Num classes valid: {valid_df.landmark_id.nunique()}') # save checkpoints training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True) joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename) logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}') save_config_checkpoint(training_args.checkpoints_dir) logger.info('Initializing the model') model = LandmarkModel(model_name=model_args.model_name, n_classes=num_classes, loss_module=model_args.loss_module, pooling_name=model_args.pooling_name, args_pooling=model_args.args_pooling, normalize=model_args.normalize, use_fc=model_args.use_fc, fc_dim=model_args.fc_dim, dropout=model_args.dropout ) logger.info("Model params:") logger.info(pformat(model_args)) lit_module = LandmarksPLBaseModule(hparams={**model_args.__dict__, **training_args.__dict__}, model=model, optimizer=training_args.optimizer, loss=model_args.loss_module) # init data dm = LandmarksDataModule(train_df, valid_df, hparams=training_args, image_dir=training_args.data_path, batch_size=training_args.batch_size, num_workers=training_args.num_workers, use_weighted_sampler=training_args.use_weighted_sampler ) # train dt_str = datetime.datetime.now().strftime("%y%m%d_%H-%M") wandb_logger = WandbLogger(name=f'{model_args.model_name.capitalize()}_GeM_ArcFace_{dt_str}', save_dir='logs/', project='landmarks') checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode='max', save_top_k=2, save_last=True, verbose=True) # hack around to change only filename, not provide the full path (which is generated by W&B) checkpoint_callback.filename = '{epoch}-{val_acc:.3f}' early_stopping_callback = EarlyStopping('val_acc', verbose=True, mode='max') trainer = pl.Trainer(gpus=training_args.gpus, logger=wandb_logger, max_epochs=training_args.n_epochs, val_check_interval=training_args.val_check_interval, checkpoint_callback=checkpoint_callback, progress_bar_refresh_rate=100, resume_from_checkpoint=training_args.resume_checkpoint, gradient_clip_val=training_args.gradient_clip_val, accumulate_grad_batches=training_args.accumulate_grad_batches, early_stop_callback=early_stopping_callback, # fast_dev_run=True, # limit_train_batches=5, # limit_val_batches=5 ) trainer.fit(lit_module, datamodule=dm) try: training_args.checkpoints_dir = get_wandb_logger_checkpoints_path(wandb_logger) logger.info(f'Saving checkpoints to the current directory: {training_args.checkpoints_dir}') except: logger.warning(f'Unable to get current checkpoints directory, using default one: ' f'{training_args.checkpoints_dir}') # save checkpoints (saved twice - in default directory above and in wandb current run folder) 
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
def main(hparams) -> None: """ Main training routine specific for this project :param hparams: """ set_seed(hparams.seed) # ------------------------ # 1 INIT LIGHTNING MODEL AND DATA # ------------------------ model = Classifier(hparams) # ------------------------ # 2 INIT EARLY STOPPING # ------------------------ early_stop_callback = EarlyStopping( monitor=hparams.monitor, min_delta=0.0, patience=hparams.patience, verbose=True, mode=hparams.metric_mode, ) # ------------------------ # 3 INIT LOGGERS # ------------------------ # Tensorboard Callback tb_logger = TensorBoardLogger( save_dir="experiments/", version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"), name="", ) # Model Checkpoint Callback ckpt_path = os.path.join( "experiments/", tb_logger.version, "checkpoints", ) # -------------------------------- # 4 INIT MODEL CHECKPOINT CALLBACK # ------------------------------- checkpoint_callback = ModelCheckpoint( filepath=ckpt_path, save_top_k=hparams.save_top_k, verbose=True, monitor=hparams.monitor, period=1, mode=hparams.metric_mode, save_weights_only=True ) # ------------------------ # 5 INIT TRAINER # ------------------------ trainer = Trainer( logger=tb_logger, checkpoint_callback=True, # callbacks=early_stop_callback, gradient_clip_val=1.0, gpus=hparams.gpus, log_gpu_memory="all", deterministic=True, check_val_every_n_epoch=1, fast_dev_run=hparams.fast_dev_run, accumulate_grad_batches=hparams.accumulate_grad_batches, max_epochs=hparams.max_epochs, min_epochs=hparams.min_epochs, # val_check_interval=hparams.val_check_interval, # distributed_backend="None", ) # ------------------------ # 6 START TRAINING # ------------------------ trainer.fit(model, model.data)
def test_callbacks_and_logger_not_called_with_fastdevrun(tmpdir, fast_dev_run): """Test that ModelCheckpoint, EarlyStopping and Logger are turned off with fast_dev_run.""" class FastDevRunModel(BoringModel): def __init__(self): super().__init__() self.training_step_call_count = 0 self.training_epoch_end_call_count = 0 self.validation_step_call_count = 0 self.validation_epoch_end_call_count = 0 self.test_step_call_count = 0 def training_step(self, batch, batch_idx): self.log("some_metric", torch.tensor(7.0)) self.logger.experiment.dummy_log("some_distribution", torch.randn(7) + batch_idx) self.training_step_call_count += 1 return super().training_step(batch, batch_idx) def training_epoch_end(self, outputs): self.training_epoch_end_call_count += 1 super().training_epoch_end(outputs) def validation_step(self, batch, batch_idx): self.validation_step_call_count += 1 return super().validation_step(batch, batch_idx) def validation_epoch_end(self, outputs): self.validation_epoch_end_call_count += 1 super().validation_epoch_end(outputs) def test_step(self, batch, batch_idx): self.test_step_call_count += 1 return super().test_step(batch, batch_idx) checkpoint_callback = ModelCheckpoint() checkpoint_callback.save_checkpoint = Mock() early_stopping_callback = EarlyStopping(monitor="foo") early_stopping_callback._evaluate_stopping_criteria = Mock() trainer_config = dict( default_root_dir=tmpdir, fast_dev_run=fast_dev_run, val_check_interval=2, logger=True, log_every_n_steps=1, callbacks=[checkpoint_callback, early_stopping_callback], ) def _make_fast_dev_run_assertions(trainer, model): # check the call count for train/val/test step/epoch assert model.training_step_call_count == fast_dev_run assert model.training_epoch_end_call_count == 1 assert model.validation_step_call_count == 0 if model.validation_step is None else fast_dev_run assert model.validation_epoch_end_call_count == 0 if model.validation_step is None else 1 assert model.test_step_call_count == fast_dev_run # check trainer arguments assert trainer.max_steps == fast_dev_run assert trainer.num_sanity_val_steps == 0 assert trainer.max_epochs == 1 assert trainer.val_check_interval == 1.0 assert trainer.check_val_every_n_epoch == 1 # there should be no logger with fast_dev_run assert isinstance(trainer.logger, DummyLogger) # checkpoint callback should not have been called with fast_dev_run assert trainer.checkpoint_callback == checkpoint_callback checkpoint_callback.save_checkpoint.assert_not_called() assert not os.path.exists(checkpoint_callback.dirpath) # early stopping should not have been called with fast_dev_run assert trainer.early_stopping_callback == early_stopping_callback early_stopping_callback._evaluate_stopping_criteria.assert_not_called() train_val_step_model = FastDevRunModel() trainer = Trainer(**trainer_config) trainer.fit(train_val_step_model) trainer.test(train_val_step_model) assert trainer.state.finished, f"Training failed with {trainer.state}" _make_fast_dev_run_assertions(trainer, train_val_step_model) # ----------------------- # also called once with no val step # ----------------------- train_step_only_model = FastDevRunModel() train_step_only_model.validation_step = None trainer = Trainer(**trainer_config) trainer.fit(train_step_only_model) trainer.test(train_step_only_model) assert trainer.state.finished, f"Training failed with {trainer.state}" _make_fast_dev_run_assertions(trainer, train_step_only_model)
    'num_features': training_input.shape[3],
    'num_timesteps_input': num_timesteps_input,
    'num_timesteps_output': num_timesteps_output,
    'gcn_type': gcn_type,
    'gcn_package': gcn_package,
    'gcn_partition': gcn_partition
})

net = WrapperNet(hparams)
net.init_data(training_input, training_target,
              val_input, val_target,
              test_input, test_target)
net.init_graph(A, edge_index, edge_weight)

early_stop_callback = EarlyStopping(patience=early_stop_rounds)
logger = TestTubeLogger(save_dir=log_dir, name=log_name)

trainer = pl.Trainer(gpus=gpus,
                     max_epochs=epochs,
                     distributed_backend=backend,
                     early_stop_callback=early_stop_callback,
                     logger=logger,
                     track_grad_norm=2)
trainer.fit(net)
print('Training time {}'.format(time.time() - start_time))

# Currently, there are some issues with testing under the ddp setting, so switch it to dp.
# Change the line below to your own checkpoint path.
# net = WrapperNet.load_from_checkpoint('logs/ddp_exp/version_1/checkpoints/_ckpt_epoch_2.ckpt')
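# A hedged sketch (an assumption, not part of the original script) of the evaluation step the
# trailing comment points at: reload a checkpoint under the 'dp' backend and run the test loop.
# The checkpoint path is the placeholder from the comment above and must be replaced.
net = WrapperNet.load_from_checkpoint('logs/ddp_exp/version_1/checkpoints/_ckpt_epoch_2.ckpt')
net.init_data(training_input, training_target,
              val_input, val_target,
              test_input, test_target)
net.init_graph(A, edge_index, edge_weight)
test_trainer = pl.Trainer(gpus=gpus, distributed_backend='dp', logger=logger)
test_trainer.test(net)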
def main(conf): train_set = WhamDataset(conf['data']['train_dir'], conf['data']['task'], sample_rate=conf['data']['sample_rate'], nondefault_nsrc=conf['data']['nondefault_nsrc']) val_set = WhamDataset(conf['data']['valid_dir'], conf['data']['task'], sample_rate=conf['data']['sample_rate'], nondefault_nsrc=conf['data']['nondefault_nsrc']) train_loader = DataLoader(train_set, shuffle=True, batch_size=conf['training']['batch_size'], num_workers=conf['training']['num_workers'], drop_last=True) val_loader = DataLoader(val_set, shuffle=False, batch_size=conf['training']['batch_size'], num_workers=conf['training']['num_workers'], drop_last=True) # Update number of source values (It depends on the task) conf['masknet'].update({'n_src': train_set.n_src}) # Define model and optimizer model = ConvTasNet(**conf['filterbank'], **conf['masknet']) optimizer = make_optimizer(model.parameters(), **conf['optim']) # Define scheduler scheduler = None if conf['training']['half_lr']: scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5) # Just after instantiating, save the args. Easy loading in the future. exp_dir = conf['main_args']['exp_dir'] os.makedirs(exp_dir, exist_ok=True) conf_path = os.path.join(exp_dir, 'conf.yml') with open(conf_path, 'w') as outfile: yaml.safe_dump(conf, outfile) # Define Loss function. loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from='pw_mtx') system = System(model=model, loss_func=loss_func, optimizer=optimizer, train_loader=train_loader, val_loader=val_loader, scheduler=scheduler, config=conf) # Define callbacks checkpoint_dir = os.path.join(exp_dir, 'checkpoints/') checkpoint = ModelCheckpoint(checkpoint_dir, monitor='val_loss', mode='min', save_top_k=5, verbose=1) early_stopping = False if conf['training']['early_stop']: early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1) # Don't ask GPU if they are not available. if not torch.cuda.is_available(): print('No available GPU were found, set gpus to None') conf['main_args']['gpus'] = None trainer = pl.Trainer( max_epochs=conf['training']['epochs'], checkpoint_callback=checkpoint, early_stop_callback=early_stopping, default_save_path=exp_dir, gpus=conf['main_args']['gpus'], distributed_backend='dp', train_percent_check=1.0, # Useful for fast experiment gradient_clip_val=5.) trainer.fit(system) best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: json.dump(best_k, f, indent=0) # Save best model (next PL version will make this easier) best_path = [b for b, v in best_k.items() if v == min(best_k.values())][0] state_dict = torch.load(best_path) system.load_state_dict(state_dict=state_dict['state_dict']) system.cpu() to_save = system.model.serialize() to_save.update(train_set.get_infos()) torch.save(to_save, os.path.join(exp_dir, 'best_model.pth'))
    def validation_epoch_end(self, outputs):
        losses = [8, 4, 2, 3, 4, 5, 8, 10]
        val_loss = losses[self.current_epoch]
        self.log('abc', torch.tensor(val_loss))
        self.log('cba', torch.tensor(0))

    def on_train_end(self) -> None:
        assert self.trainer.current_epoch == self.expected_end_epoch, 'Early Stopping Failed'


@pytest.mark.parametrize(
    "callbacks, expected_stop_epoch, accelerator, num_processes",
    [
        ([EarlyStopping(monitor='abc'), EarlyStopping(monitor='cba', patience=3)], 3, None, 1),
        ([EarlyStopping(monitor='cba', patience=3), EarlyStopping(monitor='abc')], 3, None, 1),
        pytest.param([EarlyStopping(monitor='abc'), EarlyStopping(monitor='cba', patience=3)],
                     3, 'ddp_cpu', 2, marks=RunIf(skip_windows=True)),
        pytest.param([
def main(conf): train_set = WhamDataset(conf['data']['train_dir'], conf['data']['task'], sample_rate=conf['data']['sample_rate'], nondefault_nsrc=conf['data']['nondefault_nsrc']) val_set = WhamDataset(conf['data']['valid_dir'], conf['data']['task'], sample_rate=conf['data']['sample_rate'], nondefault_nsrc=conf['data']['nondefault_nsrc']) train_loader = DataLoader(train_set, shuffle=True, batch_size=conf['data']['batch_size'], num_workers=conf['data']['num_workers'], drop_last=True) val_loader = DataLoader(val_set, shuffle=True, batch_size=conf['data']['batch_size'], num_workers=conf['data']['num_workers'], drop_last=True) # Update number of source values (It depends on the task) conf['masknet'].update({'n_src': train_set.n_src}) # Define model and optimizer in a local function (defined in the recipe). # Two advantages to this : re-instantiating the model and optimizer # for retraining and evaluating is straight-forward. model, optimizer = make_model_and_optimizer(conf) # Define scheduler scheduler = None if conf['training']['half_lr']: scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5) # Just after instantiating, save the args. Easy loading in the future. exp_dir = conf['main_args']['exp_dir'] os.makedirs(exp_dir, exist_ok=True) conf_path = os.path.join(exp_dir, 'conf.yml') with open(conf_path, 'w') as outfile: yaml.safe_dump(conf, outfile) # Define Loss function. loss_func = PITLossWrapper(pairwise_neg_sisdr, mode='pairwise') system = System(model=model, loss_func=loss_func, optimizer=optimizer, train_loader=train_loader, val_loader=val_loader, scheduler=scheduler, config=conf) # Define callbacks checkpoint_dir = os.path.join(exp_dir, 'checkpoints/') checkpoint = ModelCheckpoint(checkpoint_dir, monitor='val_loss', mode='min', save_top_k=5, verbose=1) early_stopping = False if conf['training']['early_stop']: early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1) # Don't ask GPU if they are not available. if not torch.cuda.is_available(): print('No available GPU were found, set gpus to None') conf['main_args']['gpus'] = None trainer = pl.Trainer(max_nb_epochs=conf['training']['epochs'], checkpoint_callback=checkpoint, early_stop_callback=early_stopping, default_save_path=exp_dir, gpus=conf['main_args']['gpus'], distributed_backend='dp', train_percent_check=1.0, # Useful for fast experiment gradient_clip_val=5.) trainer.fit(system) with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f: json.dump(checkpoint.best_k_models, f, indent=0)
)
validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)

early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
trainer = pl.Trainer(
    max_epochs=100,
    gpus=0,
    weights_summary="top",
    gradient_clip_val=0.1,
    early_stop_callback=early_stop_callback,
    limit_train_batches=15,
    # limit_val_batches=1,
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
)
        return self._val_dataloader


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--data-dir", type=str, required=True)
    args = parser.parse_args()

    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=0.0,
        patience=1,
        verbose=True,
        mode="min",
    )
    trainer = pl.Trainer(
        gpus=1,
        early_stop_callback=early_stop_callback,
        # train_percent_check=0.001,
        # val_percent_check=0.001,
        # max_nb_epochs=1
    )
    model = Model(args)
    trainer.fit(model)
def main(hparams): neptune_logger = NeptuneLogger( api_key= "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiN2I2ZWM0NmQtNjg0NS00ZjM5LTkzNTItN2I4Nzc0YTUzMmM0In0=", project_name="hirune924/kaggle-PANDA", close_after_fit=False, upload_source_files=['*.py', '*.ipynb'], params=vars(hparams), experiment_name=hparams.experiment_name, # Optional, #tags=["pytorch-lightning", "mlp"] # Optional, ) ''' comet_logger = CometLogger( api_key="QCxbRVX2qhQj1t0ajIZl2nk2c", workspace='hirune924', # Optional save_dir='.', # Optional project_name="kaggle-panda", # Optional #rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional #experiment_name='default' # Optional )''' tb_logger = loggers.TensorBoardLogger(save_dir=hparams.log_dir, name='default', version=None) logger_list = [tb_logger, neptune_logger ] if hparams.distributed_backend != 'ddp' else tb_logger checkpoint_callback = ModelCheckpoint(filepath=os.path.join( hparams.log_dir, '{epoch}-{avg_val_loss}-{val_qwk}'), save_top_k=10, verbose=True, monitor='avg_val_loss', mode='min', save_weights_only=True, period=1) # default used by the Trainer early_stop_callback = EarlyStopping(monitor='avg_val_loss', patience=20, min_delta=0.0, strict=True, verbose=True, mode='min') seg_model = get_seg_model_from_name(model_name=hparams.seg_model_name, in_channels=5, num_classes=2, pretrained=True) seg_ckpt_pth = glob.glob( os.path.join(hparams.seg_ckpt_dir, 'fold' + str(hparams.fold) + '*.ckpt')) seg_model = load_pytorch_model(seg_ckpt_pth[0], seg_model) if hparams.marge_type == 'cat': in_channels = 7 elif hparams.marge_type == 'add': in_channels = 3 cls_model = get_cls_model_from_name(model_name=hparams.cls_model_name, in_channels=in_channels, num_classes=1, pretrained=True) pl_model = PLImageSegmentationClassificationSystem(seg_model, cls_model, hparams) ### if hparams.auto_lr_find: trainer = Trainer() lr_finder = trainer.lr_find(pl_model) print(lr_finder.results) print(lr_finder.suggestion()) pl_model.learning_rate = lr_finder.suggestion() ### trainer = Trainer(gpus=hparams.gpus, max_epochs=hparams.max_epochs, min_epochs=hparams.min_epochs, max_steps=None, min_steps=None, checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback, logger=logger_list, accumulate_grad_batches=1, precision=hparams.precision, amp_level='O1', auto_lr_find=False, benchmark=True, check_val_every_n_epoch=hparams.check_val_every_n_epoch, distributed_backend=hparams.distributed_backend, num_nodes=1, fast_dev_run=False, gradient_clip_val=0.0, log_gpu_memory=None, log_save_interval=100, num_sanity_val_steps=5, overfit_pct=0.0) # fit model ! trainer.fit(pl_model)
num_workers = 4
DATA_DIR = "C:\\Users\\ahrn1e19\\multiclass\\image_multiclass"
root_path = 'H:\\MSc-project'

checkpoint_callback = ModelCheckpoint(
    filepath=root_path,
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

early_stop_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=0.00,
    patience=10,
    verbose=False,
    mode='auto'
)

model = torch.hub.load('pytorch/vision:v0.6.0', 'inception_v3',
                       num_classes=5, aux_logits=False,
                       transform_input=False, pretrained=False)
model.train()

# Lightning module wrapping the backbone
pl_model = InceptionV3(model)

# Most basic trainer, uses good defaults
trainer = Trainer(default_root_dir=root_path,
                  gpus=1,
                  max_epochs=35,
                  checkpoint_callback=checkpoint_callback,
                  early_stop_callback=early_stop_callback)
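# A hedged completion sketch (an assumption, not from the original excerpt): the snippet ends
# before any dataloaders are built or `trainer.fit` is called. One plausible continuation, if
# DATA_DIR follows the torchvision ImageFolder layout (one sub-folder per class):
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

_tfms = transforms.Compose([transforms.Resize((299, 299)), transforms.ToTensor()])
_train_ds = datasets.ImageFolder(DATA_DIR, transform=_tfms)
_train_loader = DataLoader(_train_ds, batch_size=32, shuffle=True, num_workers=num_workers)

trainer.fit(pl_model, train_dataloader=_train_loader)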
def test_min_steps_override_early_stopping_functionality(tmpdir, step_freeze: int, min_steps: int, min_epochs: int): """Excepted Behaviour: IF `min_steps` was set to a higher value than the `trainer.global_step` when `early_stopping` is being triggered, THEN the trainer should continue until reaching `trainer.global_step` == `min_steps`, and stop. IF `min_epochs` resulted in a higher number of steps than the `trainer.global_step` when `early_stopping` is being triggered, THEN the trainer should continue until reaching `trainer.global_step` == `min_epochs * len(train_dataloader)`, and stop. This test validate this expected behaviour IF both `min_epochs` and `min_steps` are provided and higher than the `trainer.global_step` when `early_stopping` is being triggered, THEN the highest between `min_epochs * len(train_dataloader)` and `min_steps` would be reached. Caveat: IF min_steps is divisible by len(train_dataloader), then it will do min_steps + len(train_dataloader) This test validate those expected behaviours """ _logger.disabled = True original_loss_value = 10 limit_train_batches = 3 patience = 3 class Model(BoringModel): def __init__(self, step_freeze): super(Model, self).__init__() self._step_freeze = step_freeze self._loss_value = 10.0 self._eps = 1e-1 self._count_decrease = 0 self._values = [] def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"loss": loss} def validation_step(self, batch, batch_idx): return {"test_val_loss": self._loss_value} def validation_epoch_end(self, outputs): _mean = np.mean([x['test_val_loss'] for x in outputs]) if self.trainer.global_step <= self._step_freeze: self._count_decrease += 1 self._loss_value -= self._eps self._values.append(_mean) self.log('test_val_loss', _mean) model = Model(step_freeze) model.training_step_end = None model.test_dataloader = None early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True) trainer = Trainer( default_root_dir=tmpdir, callbacks=[early_stop_callback], limit_train_batches=limit_train_batches, limit_val_batches=2, min_steps=min_steps, min_epochs=min_epochs, ) trainer.fit(model) # Make sure loss was properly decreased assert abs(original_loss_value - (model._count_decrease) * model._eps - model._loss_value) < 1e-6 pos_diff = (np.diff(model._values) == 0).nonzero()[0][0] # Compute when the latest validation epoch end happened latest_validation_epoch_end = (pos_diff // limit_train_batches) * limit_train_batches if pos_diff % limit_train_batches == 0: latest_validation_epoch_end += limit_train_batches # Compute early stopping latest step by_early_stopping = latest_validation_epoch_end + (1 + limit_train_batches) * patience # Compute min_epochs latest step by_min_epochs = min_epochs * limit_train_batches # Make sure the trainer stops for the max of all minimum requirements assert trainer.global_step == max(min_steps, by_early_stopping, by_min_epochs), ( trainer.global_step, max(min_steps, by_early_stopping, by_min_epochs), step_freeze, min_steps, min_epochs, ) _logger.disabled = False
                       download=True,
                       transform=transforms.ToTensor())
test_loader = DataLoader(test_dataset)

logger.info(f"Done!"
            f"\n# of train examples: {n_train}"
            f"\n# of val examples: {n_val}"
            f"\n# of test examples: {len(test_dataset)}")

# init model
model = LitModel(args)

if args.patience is not None:
    early_stop_ckpt = EarlyStopping(monitor='val_loss', verbose=True, patience=args.patience)
else:
    early_stop_ckpt = None

profiler = SimpleProfiler()

lightning_log_pth = '/lightning_logs'
if not os.path.isdir(lightning_log_pth):
    logger.warning(f"Unable to find {lightning_log_pth} to log to! "
                   f"If not running Grid then ignore.")
    save_dir = ''
else:
    save_dir = lightning_log_pth
if not os.path.exists(args.ckpt_path):
    print('Creating CKPT Dir')
    os.mkdir(args.ckpt_path)

'''
checkpoint_callback = ModelCheckpoint(
    filepath=args.ckpt_path,
    save_top_k=True,
    verbose=True,
    monitor='val_loss',
    mode='min',
    prefix=''
)
'''

early_stop_callback = EarlyStopping(monitor='val_loss',
                                    min_delta=0.00005,
                                    patience=3,
                                    verbose=False,
                                    mode='min')

for trial in range(args.num_runs):
    args.__dict__["rand"] = trial + 1
    trainer = pl.Trainer(default_save_path=args.ckpt_path,
                         distributed_backend=args.distributed_backend,
                         gpus=len(args.gpus.split(',')),
                         max_epochs=args.e,
                         early_stop_callback=early_stop_callback)
    model = LMFineTuner(args)
    trainer.fit(model)
    trainer.test(model)
                    default=True)
parser.add_argument("-optim", default='adam', type=str)
parser.add_argument("-lr", default=1e-5, type=float)
parser.add_argument("-lr_bert", default=5e-4, type=float)
parser.add_argument("-beta1", default=0.9, type=float)
parser.add_argument("-beta2", default=0.999, type=float)
parser.add_argument("-warmup_steps", default=8000, type=int)
parser.add_argument("-warmup_steps_bert", default=8000, type=int)
parser.add_argument("-max_grad_norm", default=0, type=float)

args = parser.parse_args()
print("\nArguments...\n")
print(args)

print("\nCreating callbacks...\n")
early_stop_callback = EarlyStopping('val_p1', patience=5)
modelfilepath = '../trainedmodels/' + args.model_type + "_" + args.save_mname + '-{epoch:02d}-{val_loss:.2f}'
checkpoint_callback = ModelCheckpoint(filepath=modelfilepath, save_top_k=10, monitor='val_p5')
logger = TensorBoardLogger("../tb_logs", name=args.model_type, version=args.save_mname)

print("\nLoading Model...\n")
model = LitNet(Eval(), args, mode=args.model_type)

trainer = Trainer(gpus=[int(item) for item in args.gpus.split(',')],
                  min_epochs=1,
                  max_epochs=args.epochs,
                  distributed_backend='dp',
                  profiler=True,
def main(argv): if not os.path.exists(FLAGS.logs_dir): os.makedirs(FLAGS.logs_dir) set_seed(FLAGS.seed) id2class, intent_examples = read_nlu_data() if FLAGS.do_train: if not os.path.exists(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) model = NluClassifier(id2class, intent_examples) early_stop_callback = EarlyStopping( monitor=FLAGS.monitor, min_delta=0.0, patience=FLAGS.patience, verbose=True, mode=FLAGS.metric_mode, ) checkpoint_callback = ModelCheckpoint(filepath=FLAGS.output_dir, save_top_k=3, monitor=FLAGS.monitor, mode=FLAGS.metric_mode, prefix='nlu_') trainer = pl.Trainer( default_root_dir='logs', gpus=(FLAGS.gpus if torch.cuda.is_available() else 0), distributed_backend='dp', max_epochs=FLAGS.epochs, fast_dev_run=FLAGS.debug, logger=pl.loggers.TensorBoardLogger('logs/', name='nlu', version=0), checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback) trainer.fit(model) if FLAGS.do_predict: from sanic import Sanic, response server = Sanic() checkpoints = list( sorted( glob(os.path.join(FLAGS.output_dir, "nlu_*.ckpt"), recursive=True))) model = NluClassifier.load_from_checkpoint( checkpoint_path=checkpoints[-1], id2class=id2class, intent_examples=intent_examples) model.eval() model.freeze() @server.route("/parse", methods=['POST']) async def parse(request): texts = request.json prediction = model.predict(texts) return response.json(prediction) server.run(host="0.0.0.0", port=5000, debug=True)
def train(exp_name, gpus): print("Start") file = open('data/nela-covid-2020/combined/headlines_cnn_bart_split.pkl', 'rb') # file = open('data/nela-covid-2020/combined/headlines_contentmorals_cnn_bart_split.pkl', 'rb') data = pickle.load(file) file.close() print("Data Loaded") # create datasets # train_dataset = NewsDataset(data['train'][0:1]) train_dataset = NewsDataset(data['train']) val_dataset = NewsDataset(data['val']) test_dataset = NewsDataset(data['test']) embedding_dataset = EmbeddingDataset() train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4) val_loader = DataLoader(val_dataset, batch_size=32, num_workers=4) # train_loader = DataLoader(train_dataset, batch_size=16, num_workers=4) # val_loader = DataLoader(val_dataset, batch_size=16, num_workers=4) # train_loader = DataLoader(embedding_dataset, batch_size=32, num_workers=4) # train_loader = DataLoader(embedding_dataset, batch_size=512, num_workers=4) # val_loader = DataLoader(embedding_dataset, batch_size=64, num_workers=4) # ------------ # training # ------------ LEARNING_RATE = 1e-5 hparams = {'lr': LEARNING_RATE} model = OneHotMoralClassifier(hparams, use_mask=False) # model = CustomMoralClassifier(hparams) # model = MoralClassifier(hparams) # model = PseudoEmbedding(hparams) early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=3, verbose=True, mode='auto') checkpoint_callback = ModelCheckpoint(dirpath=os.path.join( "./experiments", exp_name, "checkpoints"), save_top_k=1, monitor='train_loss', mode='min') trainer = Trainer( gpus=gpus, # auto_lr_find=False, # use to explore LRs # distributed_backend='dp', max_epochs=20, callbacks=[early_stop_callback, checkpoint_callback], ) # LR Exploration # lr_finder = trainer.tuner.lr_find(model, train_loader, val_loader) # print(lr_finder.results) # fig = lr_finder.plot(suggest=True) # # fig.show() # # fig.savefig('lr.png') # new_lr = lr_finder.suggestion() # print(new_lr) trainer.fit(model, train_loader, val_loader) print("Training Done")
val_dataset = create_dataset(val['X'], val['y'], model_name, 256, num_classes=2)

train_dataloader = DataLoader(train_dataset, batch_size=64, num_workers=3, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, num_workers=3, shuffle=False)

early_stopping = EarlyStopping('val_accuracy', patience=6, mode='max')
model_checkpoint = ModelCheckpoint(monitor='val_accuracy', mode='max', save_top_k=1)

trainer = pl.Trainer(
    deterministic=True,
    weights_save_path=f'checkpoints/{checkpoint_directory}/',
    logger=wandb_logger,
    early_stop_callback=early_stopping,
    checkpoint_callback=model_checkpoint,
    distributed_backend='dp',
    gpus=None,
    # gradient_clip_val=0.5,
    num_sanity_val_steps=-1,
    min_epochs=100)
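# A hedged sketch (an assumption, not from the original source): both callbacks above monitor
# 'val_accuracy', which only exists if the LightningModule logs a metric under exactly that key.
# A minimal validation hook that would satisfy them, using the self.log API, might look like this.
import pytorch_lightning as pl


class _MonitoredModule(pl.LightningModule):
    def validation_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x).argmax(dim=-1)
        acc = (preds == y).float().mean()
        # The key must match the callbacks' `monitor` argument exactly.
        self.log('val_accuracy', acc, prog_bar=True)
        return acc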
def get_early_stop_callback(patience=10):
    return EarlyStopping(monitor='val_loss', patience=patience, verbose=True, mode='min')
def get_early_stopping_callback(metric, patience):
    return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True)
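# A hedged usage sketch (not part of the original source) showing how factory helpers like the
# two above are typically wired into a Trainer. The metric name "bleu" and `LitModel` are
# placeholders/assumptions, not names defined in the original code.
import pytorch_lightning as pl

early_stopping = get_early_stopping_callback(metric="bleu", patience=5)
trainer = pl.Trainer(max_epochs=20, callbacks=[early_stopping])
# trainer.fit(LitModel())  # LitModel is a hypothetical LightningModule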
else:
    suggested_lr = float(lr_find_config.loc[args.model_name, 'lr'])
    print(f'Reading LR {suggested_lr} from archive config.')

model.lr = suggested_lr
# Need to manually update, similar to the docs.
# Reference: https://pytorch-lightning.readthedocs.io/en/latest/lr_finder.html
model.hparams.lr = suggested_lr

if args.logger_platform == 'wandb':
    logger = WandbLogger(project="ptb-xl")
elif args.logger_platform == 'tensorboard':
    logger = TensorBoardLogger(args.log_dir, name='')
    model.log_dir = args.log_dir

early_stopping_callback = EarlyStopping(
    verbose=True, monitor='val_epoch_loss', mode='min', patience=5) if args.early_stopping else None
checkpoint_callback = args.checkpoint_models and int(os.environ.get('LOCAL_RANK', 0)) == 0
progress_bar_callback = ProgressBar()
batch_gradient_verification_callback = BatchGradientVerificationCallback(
    output_mapping=lambda output: output['loss'])
batch_norm_verification_callback = BatchNormVerificationCallback()
module_monitor_callback = ModuleDataMonitor(submodules=True)

# Resetting the trainer due to some issue with threading otherwise
trainer = Trainer.from_argparse_args(
    args,
    checkpoint_callback=checkpoint_callback,
    deterministic=True,
    logger=logger,