def train_transfer_learning():
    logger = TensorBoardLogger('runs', name='pc-gita')

    batch_size = 32
    input_height = 224
    num_workers = 4

    train_dataset = PcGitaTorchDataset(
        transform=SimCLRTrainDataTransform(input_height=input_height, gaussian_blur=False),
        train=True)
    val_dataset = PcGitaTorchDataset(
        transform=SimCLRTrainDataTransform(input_height=input_height, gaussian_blur=False),
        train=False)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)
    test_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)

    model = ImagenetTransferLearning()

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=r'D:\Users\lVavrek\research\data',
        filename="transfer-learning-pcgita-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        mode="min",
    )
    # early_stopping = EarlyStopping(monitor="val_loss")

    trainer = Trainer(gpus=1, callbacks=[checkpoint_callback], logger=logger, max_epochs=20)
    trainer.fit(model, train_loader, test_loader)
def main(hparams, cluster):
    '''
    Once we receive this round's parameters we can load our model and our datamodule.
    We kept the trainer's parameters fixed for convenience and avoided using functionality
    that would make it hard to compare between models.
    '''
    dm = FlickrDataModule(batch_size=hparams.batch_size, num_workers=hparams.num_workers)
    dm.setup()

    # each trial has a separate version number which can be accessed from the cluster
    train, pad_idx = get_dataset(
        "../../data/flickr8k/images",
        "../../data/flickr8k/training_captions.txt",
        dm.transform)
    vocab_size = len(train.vocab)

    # loading our model with this run's parameters
    model = CaptionGenerator(embed_size=hparams.embed_size,
                             hidden_size=hparams.hidden_size,
                             vocab_size=vocab_size,
                             num_layers=hparams.num_layers,
                             batch_size=hparams.batch_size,
                             pad_idx=pad_idx)

    logger = TensorBoardLogger(save_dir='../../data/caption_generator/',
                               version=cluster.hpc_exp_number,
                               name='lightning_logs')

    trainer = Trainer(logger=logger,
                      gpus=2,
                      num_nodes=13,
                      max_epochs=1000,
                      auto_select_gpus=True,
                      profiler=True,
                      distributed_backend='ddp',
                      early_stop_callback=False)

    trainer.fit(model, dm)
def test_integration(dataloaders_fixed_window_without_coveratiates, tmp_path, gpus):
    train_dataloader = dataloaders_fixed_window_without_coveratiates["train"]
    val_dataloader = dataloaders_fixed_window_without_coveratiates["val"]
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=1, verbose=False, mode="min")

    logger = TensorBoardLogger(tmp_path)
    checkpoint = ModelCheckpoint(filepath=tmp_path)
    trainer = pl.Trainer(
        checkpoint_callback=checkpoint,
        max_epochs=3,
        gpus=gpus,
        weights_summary="top",
        gradient_clip_val=0.1,
        early_stop_callback=early_stop_callback,
        fast_dev_run=True,
        logger=logger,
    )

    net = NBeats.from_dataset(
        train_dataloader.dataset, learning_rate=0.15, log_gradient_flow=True, widths=[4, 4, 4], log_interval=1000
    )
    net.size()
    try:
        trainer.fit(
            net,
            train_dataloader=train_dataloader,
            val_dataloaders=val_dataloader,
        )
        # check loading
        fname = f"{trainer.checkpoint_callback.dirpath}/epoch=0.ckpt"
        net = NBeats.load_from_checkpoint(fname)

        # check prediction
        net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)
    finally:
        shutil.rmtree(tmp_path, ignore_errors=True)

    net.predict(val_dataloader, fast_dev_run=True, return_index=True, return_decoder_lengths=True)
def run_train_fragmentized_lstm():
    hp = {
        'input_dim': 200,
        'hidden_dim': 200,
        'output_dim': 4,
        'layer_dim': 1,
        'bidirectional': False,
        'dropout': 0.0,
        'batch_size': 32,
        'learning_rate': 9e-5,
        'weight_decay': 1e-4,
        'max_num_words': 64,
        'removing_stop_words': True,
        'lemmatization': False
    }
    name = get_tensorboard_log_name(hp)
    logger = TensorBoardLogger(name=name,
                               save_dir=os.path.join(os.getcwd(), '../lightning_logs', 'LSTM'))
    my_trainer = pl.Trainer(logger=logger,
                            max_epochs=20,
                            early_stop_callback=EarlyStopping(monitor='val_loss',
                                                              mode='min',
                                                              patience=3,
                                                              verbose=True),
                            gpus=1)
    model = FragmentizedLSTMClassifier(**hp)
    my_trainer.fit(model)

    model_name = name + '_' + datetime.now().strftime('%m-%d-%Y_%H.%M.%S') + '.pt'
    project_directory = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    model_path = os.path.join(project_directory, 'models', 'lstm', 'saved_models', model_name)
    torch.save(model.state_dict(), model_path)
def train_application_classification_cnn_model(data_path, model_path, gpu, trainset, valset):
    # os.environ['CUDA_VISIBLE_DEVICES']='0,1'
    logger = TensorBoardLogger('application_classification_cnn_logs', 'application_classification_cnn')
    # train_cnn(c1_kernel_size=4, c1_output_dim=200, c1_stride=3, c2_kernel_size=5, c2_output_dim=200, c2_stride=1,
    #           output_dim=17, data_path=data_path, epoch=300, gpus=gpu, model_path=model_path, signal_length=1500,
    #           logger=logger)
    train_cnn(c1_kernel_size=4, c1_output_dim=200, c1_stride=3, c2_kernel_size=5, c2_output_dim=200, c2_stride=1,
              output_dim=11, data_path=data_path, epoch=100, gpus=gpu, model_path=model_path, signal_length=1500,
              logger=logger, trainset=trainset, valset=valset)
def get_logger(log_prefix: str, args, model: BaseDeepextModel):
    test_tensor = try_cuda(torch.randn(1, 3, args.image_size, args.image_size))
    if args.log_type == "mlflow":
        logger = MLFlowLogger(
            experiment_name=f"{log_prefix}_{args.dataset}_{model.generate_model_name()}")
        # Log the model
        # with mlflow.start_run():
        #     mlflow.pytorch.log_model(model, "model")
        #
        #     # convert to scripted model and log the model
        #     scripted_pytorch_model = torch.jit.script(model)
        #     mlflow.pytorch.log_model(scripted_pytorch_model, "scripted_model")
        return logger
    elif args.log_type == "tensorboard":
        logger = TensorBoardLogger(
            save_dir="tensorboard_logs",
            version="v",
            name=f"segmentation_demo_{args.dataset}_{model.generate_model_name()}")
        logger.experiment.add_graph(model, test_tensor)
        return logger
    raise RuntimeError(f"Invalid log type: {args.log_type}")
def _set_tb_logger_and_callbacks(self, trial_name):
    """
    Define TensorBoard logger and checkpoint callbacks.
    :return:
    """
    self.tb_logger = TensorBoardLogger(save_dir=self.base_experiment_path,
                                       version=trial_name,
                                       name=None)

    checkpoints_name_template = '{epoch}_{val_accuracy:.3f}_{val_loss:.3f}'
    checkpoints_path = os.path.join(self.trial_dir, 'checkpoints', checkpoints_name_template)
    self.checkpoint_callback = ModelCheckpoint(filepath=checkpoints_path,
                                               save_top_k=3,
                                               verbose=True,
                                               monitor='val_accuracy',
                                               mode='max',
                                               prefix='')

    if self.early_stopping:
        self.early_stopping_callback = EarlyStopping(monitor='val_loss',
                                                     min_delta=1.0e-6,
                                                     patience=5,
                                                     mode='min')
def config_metric_logger(args):
    if args.metric_logger == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        # turn off the wandb console to avoid progress bar logging
        os.environ["WANDB_CONSOLE"] = "on" if args.wandb_console else "off"
        logger = WandbLogger(
            project=args.project_name,
            name=args.run_id,
            version=args.run_id,
            save_dir=Path("./runs").resolve(),
        )
    elif args.metric_logger == "tensorboard":
        from pytorch_lightning.loggers import TensorBoardLogger

        logger = TensorBoardLogger(
            save_dir=Path("./runs").resolve(),
            version=args.run_id,
            name=args.project_name,
        )
    else:
        logger = None
    return logger
def run(cfg: DictConfig) -> None:
    """
    Run pytorch-lightning model

    Args:
        cfg: hydra config
    """
    set_seed(cfg.training.seed)
    hparams = flatten_omegaconf(cfg)
    model = LitWheat(hparams=hparams, cfg=cfg)

    early_stopping = pl.callbacks.EarlyStopping(**cfg.callbacks.early_stopping.params)
    model_checkpoint = pl.callbacks.ModelCheckpoint(**cfg.callbacks.model_checkpoint.params)

    tb_logger = TensorBoardLogger(save_dir=cfg.general.save_dir)
    # comet_logger = CometLogger(save_dir=cfg.general.save_dir,
    #                            workspace=cfg.general.workspace,
    #                            project_name=cfg.general.project_name,
    #                            api_key=cfg.private.comet_api,
    #                            experiment_name=os.getcwd().split('\\')[-1])
    json_logger = JsonLogger()

    trainer = pl.Trainer(
        logger=[tb_logger, json_logger],  # comet_logger,
        callbacks=[early_stopping, model_checkpoint],
        **cfg.trainer,
    )
    trainer.fit(model)

    # save as a simple torch model
    model_name = os.getcwd().split("\\")[-1] + ".pth"
    print(model_name)
    torch.save(model.model.state_dict(), model_name)
def _create_pl_logger(self):
    if self.exp_main_dir and self.cfg["training_cfg"]["pl_logger_use"]:
        logger = []

        # Tensorboard logger
        tb_logger = TensorBoardLogger(
            save_dir=self.cfg["setup_cfg"]["exp_main_dir"],
            name="tensorboard",
            version="",
            # TODO For this to work, define self.example_input_array attribute in model
            log_graph=False,
        )
        logger.append(tb_logger)

        # Wandb logger
        wandb_name = f"{os.sep}".join(os.path.normpath(self.cfg["setup_cfg"]["exp_main_dir"]).split(os.sep)[-2:])
        wandb_name += f"{os.sep}S_{self.cfg['setup_cfg']['which_system']}"
        wandb_name += f"{os.sep}TP_{self.cfg['data_set_cfg']['palette_mode']}"
        wandb_name += f"{os.sep}M_{self.cfg['model_cfg']['which_model']}"
        wandb_logger = WandbLogger(
            entity="nikola3794",
            project=self.cfg["setup_cfg"]["project_name"],
            name=wandb_name,
            save_dir=self.cfg["setup_cfg"]["exp_main_dir"],
            id=None,
        )
        # Log gradients in wandb
        wandb_logger.watch(
            self.system,
            log='gradients',
            log_freq=self.cfg["training_cfg"]["pl_log_every_n_steps"],
        )
        logger.append(wandb_logger)
    else:
        logger = False
    return logger
def main(hparams):
    seed_everything(0)

    # If training on only 1 GPU, we must call set_device, otherwise PyTorch always stores the model on GPU 0 first
    if type(hparams.gpus) == str:
        if len(hparams.gpus) == 2:  # GPU number and comma, e.g. '0,' or '1,'
            torch.cuda.set_device(int(hparams.gpus[0]))

    # Model
    classifier = LightModule(hparams)

    # Trainer
    lr_logger = LearningRateLogger()
    logger = TensorBoardLogger("../logs", name=hparams.classifier)
    trainer = Trainer(callbacks=[lr_logger],
                      gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      deterministic=True,
                      early_stop_callback=False,
                      logger=logger)
    trainer.fit(classifier)

    # Load best checkpoint
    checkpoint_path = os.path.join(
        Path(os.getcwd()).parent, 'logs', hparams.classifier,
        'version_' + str(classifier.logger.version), 'checkpoints')
    classifier = LightModule.load_from_checkpoint(
        os.path.join(checkpoint_path, os.listdir(checkpoint_path)[0]))

    # Save weights from checkpoint
    statedict_path = os.path.join(os.getcwd(), '..', 'models', hparams.classifier + '.pt')
    torch.save(classifier.model.state_dict(), statedict_path)

    # Test model
    trainer.test(classifier)
def run_train_fragmentized_conv_net():
    hp = {
        'embedding_dim': 200,
        'output_dim': 4,
        'dropout': 0.4,
        'batch_size': 128,
        'learning_rate': 3e-4,
        'weight_decay': 3e-4,
        'filters_number': 64,
        'kernels_sizes': [5, 10, 15],
        'max_num_words': 64,
        'removing_stop_words': True,
        'lemmatization': False
    }
    name = get_tensorboard_log_name(hp)
    logger = TensorBoardLogger(name=name,
                               save_dir=os.path.join(os.getcwd(), '../lightning_logs', 'ConvNet'))
    my_trainer = pl.Trainer(logger=logger,
                            max_epochs=30,
                            early_stop_callback=EarlyStopping(monitor='val_loss',
                                                              mode='min',
                                                              patience=6,
                                                              verbose=True),
                            gpus=1)
    model = FragmentizedConvNetClassifier(**hp)
    my_trainer.fit(model)

    model_name = name + '_' + datetime.now().strftime('%m-%d-%Y_%H.%M.%S') + '.pt'
    project_directory = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    model_path = os.path.join(project_directory, 'models', 'conv_net', 'saved_models', model_name)
    torch.save(model.state_dict(), model_path)
def load_trainer(args):
    logdir = args.logdir
    logger = TensorBoardLogger(
        save_dir=os.getcwd(),
        name=logdir,
    )
    loss_callback = ModelCheckpoint(
        monitor='val_loss',
        filename='checkpoint-{epoch:02d}-{val_loss:.4f}',
        save_top_k=-1,
        mode='min',
    )
    callbacks = [loss_callback]

    device = args.device
    max_epochs = args.epochs
    resume_from_checkpoint = args.checkpoint

    if device == 'tpu':
        trainer = pl.Trainer(max_epochs=max_epochs,
                             logger=logger,
                             callbacks=callbacks,
                             tpu_cores=8,
                             resume_from_checkpoint=resume_from_checkpoint)
    elif device == 'gpu':
        trainer = pl.Trainer(max_epochs=max_epochs,
                             logger=logger,
                             callbacks=callbacks,
                             gpus=1,
                             precision=16,
                             resume_from_checkpoint=resume_from_checkpoint)
    else:
        trainer = pl.Trainer(max_epochs=max_epochs,
                             logger=logger,
                             callbacks=callbacks,
                             resume_from_checkpoint=resume_from_checkpoint)
    return trainer
def tuning(config=None, MODEL=None, pose_autoencoder=None, cost_dim=None, phase_dim=None,
           input_slices=None, output_slices=None, train_set=None, val_set=None,
           num_epochs=300, model_name="model"):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=1,
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        progress_bar_refresh_rate=5,
        callbacks=[
            TuneReportCallback({"loss": "avg_val_loss"}, on="validation_end"),
            EarlyStopping(monitor="avg_val_loss")
        ],
    )
    model = MODEL(config=config,
                  pose_autoencoder=pose_autoencoder,
                  cost_input_dimension=cost_dim,
                  phase_dim=phase_dim,
                  input_slicers=input_slices,
                  output_slicers=output_slices,
                  train_set=train_set,
                  val_set=val_set,
                  name=model_name)
    trainer.fit(model)
def test_tensorboard_with_accummulated_gradients(mock_log_metrics, tmpdir):
    """Tests to ensure that TensorBoard logs properly when accumulate_grad_batches > 1"""
    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.indexes = []

        def training_step(self, *args):
            self.log('foo', 1, on_step=True, on_epoch=True)
            if not self.trainer.train_loop.should_accumulate():
                if self.trainer.logger_connector.should_update_logs:
                    self.indexes.append(self.trainer.global_step)
            return super().training_step(*args)

    model = TestModel()
    model.training_epoch_end = None
    logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False)
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=12,
        limit_val_batches=0,
        max_epochs=3,
        accumulate_grad_batches=2,
        logger=[logger_0],
        log_every_n_steps=3,
    )
    trainer.fit(model)

    calls = [m[2] for m in mock_log_metrics.mock_calls]
    count_epochs = [c["step"] for c in calls if "foo_epoch" in c["metrics"]]
    assert count_epochs == [5, 11, 17]

    count_steps = [c["step"] for c in calls if "foo_step" in c["metrics"]]
    assert count_steps == model.indexes
def main():
    """main"""
    parser = get_parser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    model = ChnSentiClassificationTask(args)

    if args.pretrain_checkpoint:
        checkpoint = torch.load(args.pretrain_checkpoint, map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_path, 'checkpoint',
                              '{epoch}-{val_loss:.4f}-{val_acc:.4f}'),
        save_top_k=args.save_topk,
        save_last=False,
        monitor="val_acc",
        mode="max",
    )
    logger = TensorBoardLogger(save_dir=args.save_path, name='log')

    # save args
    with open(os.path.join(args.save_path, 'checkpoint', "args.json"), 'w') as f:
        args_dict = args.__dict__
        del args_dict['tpu_cores']
        json.dump(args_dict, f, indent=4)

    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        distributed_backend="ddp",
        logger=logger)

    trainer.fit(model)
def main(args: Namespace) -> None:
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = GAN(**vars(args))

    # ------------------------
    # 2 INIT TRAINER
    # ------------------------
    # If using distributed training, PyTorch recommends DistributedDataParallel.
    # See: https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel
    tb_logger = TensorBoardLogger(save_dir="logs", name="mnist-gan")
    trainer = Trainer(
        distributed_backend="horovod",
        gpus=1,
        logger=tb_logger,
        max_epochs=10,
        prepare_data_per_node=True,
    )

    # ------------------------
    # 3 START TRAINING
    # ------------------------
    trainer.fit(model)
def train_model(config=None):
    trainer = pl.Trainer(
        default_root_dir="/home/nuoc/Documents/MEX/src/version_0.2/checkpoints",
        gpus=1,
        precision=16,
        min_epochs=20,
        max_epochs=MAX_EPOCHS,
        callbacks=[
            TuneReportCallback({"loss": "avg_val_loss"}, on="validation_end")
        ],
        logger=TensorBoardLogger(save_dir="logs/", name=model_name, version="0.0"),
        stochastic_weight_avg=True)

    model = MLP_MIX(config=config,
                    input_dims=[pose_dim],
                    name=model_name,
                    train_set=train_set,
                    val_set=val_set,
                    test_set=test_set)
    trainer.fit(model)
def test_pytorch_profiler_multiple_loggers(tmpdir):
    """Tests whether the PyTorch profiler is able to write its trace locally when
    the Trainer is configured with multiple loggers. See issue #8157."""

    def look_for_trace(trace_dir):
        """Determines if a directory contains a PyTorch trace."""
        return any("trace.json" in filename for filename in os.listdir(trace_dir))

    # Sanity check
    assert not look_for_trace(tmpdir)

    model = BoringModel()
    loggers = [TensorBoardLogger(save_dir=tmpdir), CSVLogger(tmpdir)]
    trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", logger=loggers, limit_train_batches=5, max_epochs=1)
    assert len(trainer.loggers) == 2
    trainer.fit(model)
    assert look_for_trace(tmpdir)
def declare_callbacks_and_trainer(early_stopping_patience, epochs, experiment_name):
    callbacks = []
    early_stop_callback = EarlyStoppingWithColdStart(
        monitor='example_macro/macro_f1',
        min_delta=0.00,
        patience=early_stopping_patience,
        verbose=False,
        mode='max',
        strict=True,
        cold_start_epochs=10)
    callbacks.append(early_stop_callback)

    checkpoint_callback = ModelCheckpoint(
        monitor='example_macro/macro_f1',
        # dirpath='/datahdd/vmanuel/checkpoints/',
        dirpath='trained_models/',
        filename=experiment_name,
        mode='max',
        save_last=False)
    callbacks.append(checkpoint_callback)

    logger = TensorBoardLogger('lightning_logs', name=experiment_name, default_hp_metric=False)

    trainer = Trainer(
        callbacks=callbacks,
        logger=logger,
        gpus=1,
        max_epochs=epochs,
        limit_train_batches=300,
        # limit_val_batches=.25,
        precision=16)
    return trainer
def main(hparams):
    hparams.design = hparams.path.split("/")[-3].split("_")[0]

    slurm_id = os.environ.get("SLURM_JOBID")
    if slurm_id is None:
        version = None
    else:
        version = str(hparams.design + "_" + slurm_id)

    logger = TensorBoardLogger(hparams.logdir, name=hparams.exp_name, version=version)
    checkpoint_path = os.path.join(
        logger.experiment.get_logdir(), "checkpoints",
        "simplecnn_{epoch:02d}-{val_loss:.2e}-{roc_auc:.2f}")
    checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path,
                                          save_top_k=5,
                                          monitor="roc_auc",
                                          mode="max")

    model = Challenger.load_from_checkpoint(checkpoint_path=hparams.path)

    trainer = Trainer.from_argparse_args(hparams)
    trainer.logger = logger
    trainer.checkpoint_callback = checkpoint_callback

    trainer.fit(model)
def main():
    lyft_graph_dataset = LyftGraphDataset(data_dir=L5KIT_DATA_FOLDER,
                                          config_file=L5KIT_CONFIG_FILE,
                                          split="train",
                                          transform=None,
                                          pre_transform=None)
    train_loader = DataLoader(lyft_graph_dataset, batch_size=batch_size, shuffle=True)

    model = get_model()

    checkpoint_callback = ModelCheckpoint(monitor='train_loss',
                                          dirpath='./logs',
                                          save_top_k=save_top_k,
                                          mode='min')
    logger = TensorBoardLogger(save_dir="./tensorboard_logs", name='lyft_motion_pred')

    trainer = pl.Trainer(max_epochs=max_epochs, callbacks=[checkpoint_callback], logger=logger)
    trainer.fit(model, train_loader)
def train_grid_search_model(self, save_path, params=None):
    dataset = read_data(self.path_to_embeddings, self.path_to_labels)
    train_dataloader, test_dataloader = create_dataloader(dataset, test_size=0.1, shuffle=False)
    embeddings_size = dataset[0][0].shape[1]
    model = LSTMTagger

    self.run_records = []
    params = self.create_list_of_train_hyperparameters()
    for param in params:
        print(param)
        loss = self.criterion()
        # instantiate the selected model class with this run's hyperparameters
        self.model = model(word_emb_dim=embeddings_size, lstm_hidden_dim=param.dim)
        optimizer = param.optim(self.model.parameters(), lr=param.lr)
        logger = TensorBoardLogger("tb_logs", name="PlBugLocModel")
        autoencoder = PlBugLocModel(model)
        trainer = pl.Trainer(logger=logger, log_every_n_steps=10)
        # assumption: each candidate configuration is trained on the dataloaders built above,
        # mirroring fit_model_from_params
        trainer.fit(autoencoder, train_dataloader, test_dataloader)
    self.params = params
def fit_model_from_params(self, params=None, use_best_params=False, path_to_results='.',
                          save_dir=None, model_name=None):
    dataset = read_data(self.path_to_embeddings, self.path_to_labels)
    self.train_dataloader, self.test_dataloader = create_dataloader(dataset, test_size=0.2)
    embeddings_size = dataset[0][0].shape[1]
    model = LSTMTagger

    if save_dir is None:
        save_dir = '.'
    if model_name is None:
        model_name = 'lstm_' + self.postfix_hash
    save_path = os.path.join(save_dir, model_name)

    if use_best_params:
        self.train_grid_search_model(save_path)
        best_params = self.model.best_params()
    else:
        best_params = params

    logger = TensorBoardLogger("tb_logs", name="PlBugLocModel")
    autoencoder = PlBugLocModel(model)
    trainer = pl.Trainer(logger=logger,
                         log_every_n_steps=10,
                         max_epochs=20,
                         callbacks=[ModelCheckpoint(dirpath=save_dir)])
    trainer.fit(autoencoder, self.train_dataloader, self.test_dataloader)

    self.model = autoencoder.model
    self.params = params
    autoencoder.save_model(save_path)
    return trainer
def separable_hnn(num_points, input_h_s=None, input_model=None,
                  save_path='temp_save_path', train=True, epoch_save=100):
    """
    Separable Hamiltonian network.
    :return:
    """
    if input_h_s:
        h_s = input_h_s
        model = input_model
    else:
        h_s = HNN1DWaveSeparable(
            nn.Sequential(nn.Linear(3 * num_points, 20), nn.Tanh(),
                          nn.Linear(20, 20), nn.Tanh(),
                          nn.Linear(20, 20), nn.Tanh(),
                          nn.Linear(20, 20), nn.Tanh(),
                          nn.Linear(20, 20), nn.Tanh(),
                          nn.Linear(20, 1))).to(device)
        model = DENNet(h_s, case='1DWave').to(device)

    if train:
        learn_sep = Learner(model, num_boundary=num_boundary, save_path=save_path, epoch_save=epoch_save)
        logger = TensorBoardLogger('separable_logs')
        trainer_sep = pl.Trainer(min_epochs=701, max_epochs=701, logger=logger, gpus=1)
        trainer_sep.fit(learn_sep)

    return h_s, model
def main():
    system = configure_system(hyperparameter_defaults["system"])(hyperparameter_defaults)

    logger = TensorBoardLogger(
        'experiments_logs',
        name=str(hyperparameter_defaults['system']) + "_" +
             str(system.model.__class__.__name__) + "_" +
             str(hyperparameter_defaults['criterion']) + "_" +
             str(hyperparameter_defaults['scheduler']))

    early_stop = EarlyStopping(monitor="valid_iou",
                               mode="max",
                               verbose=True,
                               patience=hyperparameter_defaults["patience"])
    model_checkpoint = ModelCheckpoint(
        monitor="valid_iou",
        mode="max",
        verbose=True,
        filename='Model-{epoch:02d}-{valid_iou:.5f}',
        save_top_k=3,
        save_last=True)

    trainer = pl.Trainer(
        gpus=[0, 1],
        plugins=DDPPlugin(find_unused_parameters=True),
        max_epochs=hyperparameter_defaults['epochs'],
        logger=logger,
        check_val_every_n_epoch=1,
        accelerator='ddp',
        callbacks=[early_stop, model_checkpoint],
        num_sanity_val_steps=0,
        limit_train_batches=1.0,
        deterministic=True,
    )
    trainer.fit(system)
    trainer.test(system)
def train_mnist_tune_checkpoint(config, checkpoint_dir=None, data_dir=None, num_epochs=10, num_gpus=0):
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=0,
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={
                    "loss": "ptl/val_loss",
                    "mean_accuracy": "ptl/val_accuracy"
                },
                filename="checkpoint",
                on="validation_end")
        ])
    if checkpoint_dir:
        # Currently, this leads to errors:
        # model = LightningMNISTClassifier.load_from_checkpoint(
        #     os.path.join(checkpoint, "checkpoint"))
        # Workaround:
        ckpt = pl_load(
            os.path.join(checkpoint_dir, "checkpoint"),
            map_location=lambda storage, loc: storage)
        model = LightningMNISTClassifier._load_model_state(ckpt, config=config, data_dir=data_dir)
        trainer.current_epoch = ckpt["epoch"]
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)

    trainer.fit(model)
def main(args):
    model_name = 'NART'
    logger = TensorBoardLogger('ckpt', model_name)
    dir_path = f'{logger.save_dir}/{model_name}/version_{logger.version}/'
    file_name = '{epoch}-{val_loss: .4f}'
    model_checkpoint = ModelCheckpoint(dirpath=dir_path,
                                       filename=file_name,
                                       monitor='val_loss',
                                       verbose=True,
                                       save_top_k=5)
    trainer = pl.Trainer(
        logger=logger,
        checkpoint_callback=model_checkpoint,
        # profiler=AdvancedProfiler('profile'),
        gradient_clip_val=5.0,
        gpus=[1],
        precision=16,
        amp_level='O1',
        amp_backend='native',
        reload_dataloaders_every_epoch=True,
        max_epochs=500,
        min_epochs=500,
        weights_summary=None,
        accumulate_grad_batches=4,
        resume_from_checkpoint=args.resume_from,
        flush_logs_every_n_steps=5000,
        benchmark=True,
        deterministic=True,
        log_every_n_steps=5000,
        limit_val_batches=1.0,
    )
    data = DataModule()
    model = NART(vars(args))
    # model = model.load_from_checkpoint('ckpt/NART/version_49/epoch=3-val_loss= 0.5988.ckpt', strict=False)
    trainer.fit(model=model, datamodule=data)
def main(config_path):
    seed_everything(42)
    initializer = Initializer(None)
    initializer.load_from_yaml(config_path)
    config = initializer.config

    train_loader = initializer.get_train_dataloader()
    val_loader = initializer.get_dev_dataloader()
    model = initializer.get_lightning_model()

    model_name = config.model['class'].split('.')[-1]
    logger = TensorBoardLogger(**config.logger_ckpt, name=model_name)
    file_path = (f'{logger.save_dir}/{model_name}/version_{logger.version}/'
                 '{epoch}-{val_loss: .4f}-{val_mer: .4f}')
    model_checkpoint = ModelCheckpoint(filepath=file_path,
                                       monitor='val_loss',
                                       verbose=True,
                                       save_top_k=2)
    trainer = Trainer(
        **config.trainer,
        checkpoint_callback=model_checkpoint,
        logger=logger,
        profiler=True,
    )
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
def main(conf: Conf):
    model = Net(
        conf.to_hparams(),
        reduce_lr=conf.reduce_lr,
    )

    logger = TensorBoardLogger(
        conf.save_dir,
        name='{}'.format(cell_name),
        version='split_{}_seed_{}_{}'.format(split_number, conf.seed, str(int(time())))
    )

    # Copy this script to log_dir
    log_dir = Path(logger.log_dir)
    log_dir.mkdir(exist_ok=True, parents=True)
    shutil.copy(Path(__file__), log_dir)
    shutil.copy(Path(root + "/src/train_regressor.py"), log_dir)
    shutil.copy(Path(root + "/src/create_pytorch_graphs.py"), log_dir)
    shutil.copy(Path(root + "/src/regressor.py"), log_dir)
    shutil.copy(Path(root + "/src/train_test_split.py"), log_dir)

    trainer = pl.Trainer(
        max_epochs=conf.epochs,
        gpus=[0],
        logger=logger,
        resume_from_checkpoint=conf.ckpt_path,  # load from checkpoint instead of resume
        weights_summary='top',
        deterministic=True,
        auto_lr_find=False,
        # precision=16
    )
    trainer.fit(model)
    trainer.test()