def main(dataset, gpus):
    batch_size = 64
    data = NABTraf(batch_size=batch_size, data_path=dataset)
    net = TadGAN(in_size=1, weight_decay=1e-6, iterations_critic=5, lr=0.0005, use_gru=True)
    net.example_input_array = torch.ones(batch_size, 100, 1, dtype=torch.float)
    logger = TensorBoardLogger('logs', name='tadgan', log_graph=True)
    # early_stop_callback = EarlyStopping(
    #     monitor='F1',
    #     min_delta=0.00,
    #     patience=3,
    #     verbose=True,
    #     mode='max'
    # )
    trainer = pl.Trainer(
        plugins=[DDPPlugin(find_unused_parameters=True)],
        fast_dev_run=False,
        weights_summary='full',
        log_gpu_memory=True,
        gpus=gpus,
        accelerator='ddp',
        logger=logger,
        check_val_every_n_epoch=5,
        max_epochs=100,
        callbacks=[
            GPUStatsMonitor(),
            # early_stop_callback
        ])
    trainer.fit(net, datamodule=data)
def test_gpu_stats_monitor_no_queries(tmpdir):
    """Test GPU logger doesn't fail if no "nvidia-smi" queries are to be performed."""
    model = BoringModel()
    gpu_stats = GPUStatsMonitor(
        memory_utilization=False,
        gpu_utilization=False,
        intra_step_time=True,
        inter_step_time=True,
    )
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[gpu_stats],
    )
    with mock.patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics") as log_metrics_mock:
        trainer.fit(model)

    assert log_metrics_mock.mock_calls[2:] == [
        mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=0),
        mock.call({"batch_time/inter_step (ms)": mock.ANY}, step=1),
        mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=1),
    ]
def get_trainer(args):
    pl.seed_everything(args.seed)

    # loggers
    root_dir = Path(args.default_root_dir).expanduser().resolve()
    root_dir.mkdir(parents=True, exist_ok=True)
    tb_save_dir = root_dir / "tb"
    tb_logger = TensorBoardLogger(save_dir=tb_save_dir)
    loggers = [tb_logger]
    logger.info(f"Run tensorboard --logdir {tb_save_dir}")

    # callbacks
    ckpt_cb = ModelCheckpoint(verbose=True)
    lr_cb = LearningRateMonitor(logging_interval="step")
    pb_cb = ProgressBar(refresh_rate=args.progress_bar_refresh_rate)
    callbacks = [lr_cb, pb_cb]
    callbacks.append(ckpt_cb)
    gpu_cb = GPUStatsMonitor()
    callbacks.append(gpu_cb)

    plugins = []
    trainer = pl.Trainer.from_argparse_args(args, logger=loggers, callbacks=callbacks, plugins=plugins)
    return trainer
def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()
    logger = CSVLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger
    )

    results = trainer.fit(model)
    assert results

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    with open(path_csv, 'r') as fp:
        lines = fp.readlines()

    header = lines[0].split()

    fields = [
        'utilization.gpu',
        'memory.used',
        'memory.free',
        'utilization.memory'
    ]

    for f in fields:
        assert any(f in h for h in header)
def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=log_every_n_steps,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=",", names=True, deletechars="", replace_space=" ")
    batch_time_data = met_data["batch_time/intra_step (ms)"]
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
    for f in fields:
        assert any(f in h for h in met_data.dtype.names)
def test_gpu_stats_monitor_cpu_machine(tmpdir):
    """Test GPUStatsMonitor on CPU machine."""
    with pytest.raises(MisconfigurationException, match="NVIDIA driver is not installed"), pytest.deprecated_call(
        match="GPUStatsMonitor` callback was deprecated in v1.5"
    ):
        GPUStatsMonitor()
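# The deprecation warnings matched in the tests above point at Lightning v1.5,
# where GPUStatsMonitor was superseded by DeviceStatsMonitor. A minimal sketch
# of the migration (assuming pytorch_lightning >= 1.5; unlike GPUStatsMonitor,
# DeviceStatsMonitor has no per-field toggles and logs whatever stats the
# device backend reports):
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import DeviceStatsMonitor

trainer = Trainer(gpus=1, callbacks=[DeviceStatsMonitor()])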
def test_gpu_stats_monitor_cpu_machine(tmpdir):
    """Test GPUStatsMonitor on CPU machine."""
    with pytest.raises(MisconfigurationException, match='NVIDIA driver is not installed'):
        GPUStatsMonitor()
def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser = pl.Trainer.add_argparse_args(parser)
    parser = LitClassifier.add_model_specific_args(parser)
    args = parser.parse_args()

    # ------------
    # data
    # ------------
    dataset = MNIST(_DATASETS_PATH, train=True, download=True, transform=transforms.ToTensor())
    mnist_test = MNIST(_DATASETS_PATH, train=False, download=True, transform=transforms.ToTensor())
    mnist_train, mnist_val = random_split(dataset, [55000, 5000])

    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)

    # ------------
    # model
    # ------------
    model = LitClassifier(Backbone(hidden_dim=args.hidden_dim), args.learning_rate)

    # ------------
    # training
    # ------------
    experiment_dir = Path.cwd()
    checkpoint_callback = ModelCheckpoint(dirpath=experiment_dir)
    # gradient_clip_val, benchmark and callbacks are Trainer constructor
    # arguments, not fit()/test() arguments, so they are passed when the
    # Trainer is built.
    trainer = pl.Trainer.from_argparse_args(
        args,
        gradient_clip_val=0.5,
        benchmark=True,
        callbacks=[GPUStatsMonitor(), checkpoint_callback],
    )
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # testing
    # ------------
    result = trainer.test(test_dataloaders=test_loader)
    print(result)
def test_gpu_stats_monitor_parse_gpu_stats():
    logs = GPUStatsMonitor._parse_gpu_stats([1, 2], [[3, 4, 5], [6, 7]], [("gpu", "a"), ("memory", "b")])
    expected = {
        "device_id: 1/gpu (a)": 3,
        "device_id: 1/memory (b)": 4,
        "device_id: 2/gpu (a)": 6,
        "device_id: 2/memory (b)": 7,
    }
    assert logs == expected
def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
    """Test GPUStatsMonitor raises a warning when not training on GPU device."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_steps=1, gpus=None)

    with pytest.raises(MisconfigurationException, match="not running on GPU"):
        trainer.fit(model)
def test_gpu_stats_monitor_no_logger(tmpdir):
    """Test GPUStatsMonitor with no logger in Trainer."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_epochs=1, gpus=1, logger=False)

    with pytest.raises(MisconfigurationException, match="Trainer that has no logger."):
        trainer.fit(model)
def test_gpu_stats_monitor_parse_gpu_stats():
    logs = GPUStatsMonitor._parse_gpu_stats('1,2', [[3, 4, 5], [6, 7]], [('gpu', 'a'), ('memory', 'b')])
    expected = {
        'gpu_id: 1/gpu (a)': 3,
        'gpu_id: 1/memory (b)': 4,
        'gpu_id: 2/gpu (a)': 6,
        'gpu_id: 2/memory (b)': 7
    }
    assert logs == expected
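# Both versions of the _parse_gpu_stats test above (list-based device ids with
# a "device_id:" prefix, and the older comma-separated string with a "gpu_id:"
# prefix) pin down the same zipping logic: pair each device with its row of
# stat values and build one flat metric dict. A standalone sketch of what the
# newer test asserts (parse_gpu_stats here is a hypothetical re-implementation
# for illustration, not the library function):
def parse_gpu_stats(device_ids, stats, keys):
    """Flatten per-device stat rows into 'device_id: {id}/{name} ({unit})' keys."""
    logs = {}
    for device_id, device_stats in zip(device_ids, stats):
        # zip truncates extra stat values that have no corresponding key
        for (name, unit), value in zip(keys, device_stats):
            logs[f"device_id: {device_id}/{name} ({unit})"] = value
    return logs

assert parse_gpu_stats([1, 2], [[3, 4, 5], [6, 7]], [("gpu", "a"), ("memory", "b")]) == {
    "device_id: 1/gpu (a)": 3,
    "device_id: 1/memory (b)": 4,
    "device_id: 2/gpu (a)": 6,
    "device_id: 2/memory (b)": 7,
}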
def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
    """Test GPUStatsMonitor raises a warning when not training on GPU device."""
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_steps=1, gpus=None)

    with pytest.raises(MisconfigurationException, match='not running on GPU'):
        trainer.fit(model)
def test_gpu_stats_monitor_no_logger(tmpdir):
    """Test GPUStatsMonitor with no logger in Trainer."""
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_epochs=1, gpus=1, logger=False)

    with pytest.raises(MisconfigurationException, match='Trainer that has no logger.'):
        trainer.fit(model)
def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
    """Test GPUStatsMonitor raises a warning when not training on GPU device."""
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_steps=1, gpus=None)

    with pytest.warns(RuntimeWarning, match='not running on GPU. Logged utilization will be independent'):
        trainer.fit(model)
def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = BoringModel()
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_train_batches=7,
                      log_every_n_steps=log_every_n_steps,
                      gpus=1,
                      callbacks=[gpu_stats],
                      logger=logger)

    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=',', names=True, deletechars='', replace_space=' ')
    batch_time_data = met_data['batch_time/intra_step (ms)']
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = [
        'utilization.gpu',
        'memory.used',
        'memory.free',
        'utilization.memory',
    ]

    for f in fields:
        assert any(f in h for h in met_data.dtype.names)
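# The intra_step_time / inter_step_time options used in several tests above
# time the training loop itself rather than query nvidia-smi: intra-step is
# the duration of one batch, inter-step the gap between consecutive batches.
# A rough sketch of that bookkeeping (simplified and hypothetical, not the
# callback's actual internals; `log` stands in for the logger call):
import time

class StepTimer:
    """Minimal sketch: batch duration and inter-batch gap in milliseconds."""

    def __init__(self):
        self._batch_start = None
        self._batch_end = None

    def on_train_batch_start(self, log):
        self._batch_start = time.time()
        if self._batch_end is not None:
            # Gap since the previous batch finished; first batch has none,
            # which matches the intra@0, inter@1, intra@1 call pattern above.
            log({"batch_time/inter_step (ms)": (self._batch_start - self._batch_end) * 1000})

    def on_train_batch_end(self, log):
        self._batch_end = time.time()
        log({"batch_time/intra_step (ms)": (self._batch_end - self._batch_start) * 1000})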
def main(hparams):
    exp_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    tb_logger = TensorBoardLogger(save_dir='logs/', version=f'v_{exp_time}')
    gpu_stats = GPUStatsMonitor()

    data = IMDBDataModule(hparams=hparams)
    data.prepare_data()
    data.setup()

    model = RNN(input_dim=data.dims, hparams=hparams)

    trainer = Trainer(logger=tb_logger,
                      gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      callbacks=[gpu_stats])
    trainer.fit(model, data)
    trainer.test(datamodule=data)

    torch.save(trainer.model.state_dict(), f'models/model_{exp_time}.pth')
def objective(trial):
    if hparams.version is None:
        hparams.version = str(uuid1())

    # main LightningModule
    pretrain_system = PreTrainSystem(
        learning_rate=trial.suggest_loguniform("learning_rate", 1e-5, 1e-2),
        beta_1=hparams.beta_1,
        beta_2=hparams.beta_2,
        weight_decay=trial.suggest_uniform("weight_decay", 1e-5, 1e-2),
        optimizer=hparams.optimizer,
        batch_size=hparams.batch_size,
        multiplier=hparams.multiplier,
        scheduler_patience=hparams.scheduler_patience,
    )

    pretrain_checkpoints = ModelCheckpoint(
        dirpath=MODEL_CHECKPOINTS_DIR,
        monitor="Val/loss_epoch",
        verbose=True,
        mode="min",
        save_top_k=hparams.save_top_k,
    )
    pretrain_early_stopping = EarlyStopping(
        monitor="Val/loss_epoch",
        min_delta=0.00,
        patience=hparams.patience,
        verbose=False,
        mode="min",
    )
    pretrain_gpu_stats_monitor = GPUStatsMonitor(temperature=True)
    log_recoloring_to_tensorboard = LogPairRecoloringToTensorboard()
    optuna_pruning = PyTorchLightningPruningCallback(monitor="Val/loss_epoch", trial=trial)

    logger = TensorBoardLogger(
        S3_LIGHTNING_LOGS_DIR,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    trainer = Trainer.from_argparse_args(
        hparams,
        logger=logger,
        checkpoint_callback=pretrain_checkpoints,
        callbacks=[
            pretrain_early_stopping,
            log_recoloring_to_tensorboard,
            pretrain_gpu_stats_monitor,
            optuna_pruning,
        ],
        profiler="simple",
    )

    datamodule = PreTrainDataModule(
        batch_size=pretrain_system.hparams.batch_size,
        multiplier=pretrain_system.hparams.multiplier,
        shuffle=hparams.shuffle,
        num_workers=hparams.num_workers,
        size=hparams.size,
        pin_memory=hparams.pin_memory,
        train_batch_from_same_image=hparams.train_batch_from_same_image,
        val_batch_from_same_image=hparams.val_batch_from_same_image,
        test_batch_from_same_image=hparams.test_batch_from_same_image,
    )

    # trainer.tune(pretrain_system, datamodule=datamodule)
    trainer.fit(pretrain_system, datamodule=datamodule)

    # get best checkpoint
    best_model_path = pretrain_checkpoints.best_model_path
    pretrain_system = PreTrainSystem.load_from_checkpoint(best_model_path)

    test_result = trainer.test(pretrain_system, datamodule=datamodule)
    pretrain_system.hparams.test_metric_name = test_result[0]["Test/loss_epoch"]

    logger.log_hyperparams(pretrain_system.hparams)
    logger.finalize(status="success")

    # upload best model to S3
    S3_best_model_path = os.path.join(
        S3_MODEL_CHECKPOINTS_RELATIVE_DIR,
        hparams.name,
        ".".join([hparams.version, best_model_path.split(".")[-1]]),
    )
    upload_to_s3(best_model_path, S3_best_model_path)

    return test_result[0]["Test/loss_epoch"]
def main(hparams):
    if hparams.logging_location == "s3":
        logging_dir = os.path.join(S3_LIGHTNING_LOGS_DIR, hparams.name)
    else:
        logging_dir = os.path.join(LIGHTNING_LOGS_DIR, hparams.name)

    # main LightningModule
    if hparams.checkpoint_path is not None:
        pretrain_system = PreTrainSystem.load_from_checkpoint(hparams.checkpoint_path)
    else:
        pretrain_system = PreTrainSystem(**vars(hparams))

    pretrain_checkpoints = ModelCheckpoint(
        dirpath=os.path.join(MODEL_CHECKPOINTS_DIR, hparams.version),
        monitor="Val/loss",
        verbose=True,
        mode="min",
        save_top_k=hparams.save_top_k,
    )
    pretrain_early_stopping = EarlyStopping(
        monitor="Val/loss",
        min_delta=0.00,
        patience=hparams.patience,
        verbose=False,
        mode="min",
    )
    gpu_stats = GPUStatsMonitor(temperature=True)
    log_recolored_to_tensorboard = LogPairRecoloringToTensorboard()
    log_hyperparams_to_tensorboard = LogHyperparamsToTensorboard(hp_metric="Test/loss")
    notify = Notify(test_metric_name="Test/loss")

    logger = TensorBoardLogger(
        logging_dir,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    trainer = Trainer.from_argparse_args(
        hparams,
        resume_from_checkpoint=hparams.checkpoint_path,
        logger=logger,
        checkpoint_callback=pretrain_checkpoints,
        callbacks=[
            pretrain_early_stopping,
            log_recolored_to_tensorboard,
            log_hyperparams_to_tensorboard,
            gpu_stats,
            notify,
        ],
        profiler="simple",
        benchmark=True,
    )

    datamodule = PreTrainDataModule(**vars(hparams))
    trainer.fit(pretrain_system, datamodule=datamodule)

    # lightning automatically uses the best model checkpoint for testing
    trainer.test(pretrain_system, datamodule=datamodule)

    if hparams.upload_model_to_s3:
        # upload best model to S3
        best_model_path = pretrain_checkpoints.best_model_path
        S3_best_model_path = os.path.join(
            S3_MODEL_CHECKPOINTS_RELATIVE_DIR,
            hparams.name,
            ".".join([hparams.version, best_model_path.split(".")[-1]]),
        )
        upload_to_s3(best_model_path, S3_best_model_path)
def train(cfg: DictConfig) -> None:
    """
    Train a model for image classification

    Args:
        cfg: hydra configuration
    """
    # Load pre-existing config file
    if os.path.exists("config.yaml"):
        logging.info("Loading pre-existing config file")
        cfg = OmegaConf.load("config.yaml")
    else:
        # copy initial config to a separate file to avoid overwriting it
        # when hydra resumes training and initializes again
        shutil.copy2(".hydra/config.yaml", "config.yaml")

    # Check for checkpoint
    ckpt_path = os.path.join(os.getcwd(), cfg.checkpoint.params.dirpath, "last.ckpt")
    if os.path.exists(ckpt_path):
        logging.info(f"Loading existing checkpoint @ {ckpt_path}")
    else:
        logging.info("No existing ckpt found. Training from scratch")
        ckpt_path = None

    # Display configuration
    logger.info(OmegaConf.to_yaml(cfg))

    # Seed everything
    seed_everything(cfg.training.seed)

    # Load datamodule
    data = DataModule(cfg)

    # Callbacks
    callbacks = [
        CustomModelCheckpoint(**cfg.checkpoint.params),
        LearningRateMonitor(),
        LitProgressBar(),
    ]
    if cfg.trainer.params.gpus:
        callbacks.append(GPUStatsMonitor())

    # Logger
    trainer_logger = load_obj(cfg.logger.class_name)(**cfg.logger.params)

    # Load model
    model = load_obj(cfg.model.class_name)(cfg)

    # Save model id
    with open("id", "w") as f:
        f.write(cfg.id)

    # Instantiate trainer
    trainer = Trainer(
        resume_from_checkpoint=ckpt_path,
        callbacks=callbacks,
        logger=trainer_logger,
        **cfg.trainer.params,
    )

    # Display model architecture alongside parameters and data
    logger.info(model)
    logger.info(data)
    logger.info(f"random seed: {cfg.training.seed}")

    # Fit trainer
    trainer.fit(model, datamodule=data)
def main(
    experiment_type: str,
    affordance_type: AffordanceType,
    ngrams: int,
    return_words: int,
    definition_length: int,
    model_type: str,
    model_path: Optional[str],
    save_path: Optional[str],
    gpus: int,
    batch_size: int,
    learning_rate: float,
    fix_valid_set: bool,
    evaluation_only: bool,
    notebook: bool,
):
    # Device
    if gpus > 0 and not torch.cuda.is_available():
        gpus = 0
        warn('GPU is not available on this machine. Using CPU instead.')
    device = torch.device('cuda') if gpus > 0 else torch.device('cpu')

    # Training data
    train_set = PiqaDataset("train", fix=fix_valid_set)
    valid_set = PiqaDataset("valid", fix=fix_valid_set)
    test_set = PiqaDataset("test", fix=fix_valid_set)

    # Model & Tokenizer
    try:
        model = PIQAModel.get(model_type)(learning_rate=learning_rate, model_type=model_type)
        tokenizer = PIQATokenizer.get(model_type)(experiment_type,
                                                  ngrams,
                                                  return_words,
                                                  definition_length,
                                                  affordance_type,
                                                  model_type,
                                                  tqdm_arg=notebook)
    except TypeError:
        raise RuntimeError(f'{model_type} has not been implemented.')

    # Load finetuned weights
    if model_path is not None:
        model.load_state_dict(torch.load(model_path))

    # Just evaluation
    if evaluation_only:
        model.eval()
    else:
        model.train()

    # Pre-tokenize data sets
    collate_fn = lambda x: tokenizer.collate_fn(x, pad_token=tokenizer.pad_token_id)

    if tokenizer._type == 'affordance':
        all_sets_path = Path(
            f'./data/{tokenizer._type}_{ngrams}_{return_words}_{definition_length}_{affordance_type}.pkl')
    elif tokenizer._type == 'definition':
        all_sets_path = Path(
            f'./data/{tokenizer._type}_{ngrams}_{return_words}_{definition_length}.pkl')
    else:
        all_sets_path = Path(f'./data/{tokenizer._type}.pkl')

    if all_sets_path.exists():
        with open(all_sets_path, 'rb') as f:
            all_sets = pickle.load(f)
        train_set = all_sets['train']
        test_set = all_sets['test']
        valid_set = all_sets['valid']
    else:
        train_set = tokenizer.pretokenize_data_set(train_set)
        valid_set = tokenizer.pretokenize_data_set(valid_set)
        test_set = tokenizer.pretokenize_data_set(test_set)
        with open(all_sets_path, 'wb') as f:
            pickle.dump({
                'train': train_set,
                'test': test_set,
                'valid': valid_set
            }, f)

    valid_set = tokenizer.tokenize_data_set(valid_set)
    test_set = tokenizer.tokenize_data_set(test_set)
    train_set = tokenizer.tokenize_data_set(train_set)

    trainloader = DataLoader(train_set, shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
    validloader = DataLoader(valid_set, shuffle=False, collate_fn=collate_fn, batch_size=batch_size)
    testloader = DataLoader(test_set, shuffle=False, collate_fn=collate_fn)

    # Load callbacks
    callbacks = []
    callbacks.append(
        EarlyStopping('val_accuracy', min_delta=0.001, patience=5, mode='max', verbose=True))
    if save_path is not None:
        callbacks.append(
            ModelCheckpoint(save_path, filename='{epoch}-{val_loss:.2f}'))
    if gpus > 0:
        callbacks.append(
            GPUStatsMonitor(True, False, False, False, False, False))

    # Training
    trainer = pl.Trainer(gpus=gpus, auto_scale_batch_size=False, callbacks=callbacks)
    trainer.fit(model, trainloader, validloader)
    print("Finished Training")
def main(hparams):
    if hparams.checkpoints_location == "s3":
        checkpoints_dir = os.path.join(S3_MODEL_CHECKPOINTS_DIR, hparams.name, hparams.version)
    else:
        checkpoints_dir = os.path.join(MODEL_CHECKPOINTS_DIR, hparams.name, hparams.version)

    if hparams.logging_location == "s3":
        logging_dir = os.path.join(S3_LIGHTNING_LOGS_DIR, hparams.name)
    else:
        logging_dir = os.path.join(LIGHTNING_LOGS_DIR, hparams.name)

    # load generator pretrained with PreTrainSystem
    generator = PreTrainSystem.load_from_checkpoint(PRETRAINED_MODEL_CHECKPOINT_PATH).generator

    # main LightningModule
    if hparams.checkpoint_path is not None:
        adversarial_system = AdversarialMSESystem.load_from_checkpoint(hparams.checkpoint_path)
    else:
        adversarial_system = AdversarialMSESystem(**vars(hparams), generator=generator)

    adversarial_checkpoints = ModelCheckpoint(
        dirpath=checkpoints_dir,
        monitor="Val/adv_loss",
        verbose=True,
        mode="min",
        save_top_k=-1,
    )
    gpu_stats = GPUStatsMonitor(temperature=True)
    log_recolored_to_tensorboard = LogAdversarialMSEToTensorboard()
    log_hyperparams_to_tensorboard = LogHyperparamsToTensorboard(hp_metric=None)
    notify = Notify()

    logger = TensorBoardLogger(
        logging_dir,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    # trainer
    trainer = Trainer.from_argparse_args(
        hparams,
        resume_from_checkpoint=hparams.checkpoint_path,
        logger=logger,
        checkpoint_callback=adversarial_checkpoints,
        callbacks=[
            log_recolored_to_tensorboard,
            log_hyperparams_to_tensorboard,
            gpu_stats,
            notify,
        ],
        profiler="simple",
        benchmark=True,
        enable_pl_optimizer=True,
    )

    datamodule = GANDataModule(**vars(hparams))
    trainer.fit(adversarial_system, datamodule=datamodule)

    # lightning automatically uses the best model checkpoint for testing
    trainer.test(adversarial_system, datamodule=datamodule)
def build(self, **kwargs):
    """Responsible for creating the class attributes."""
    # Track that build() was called
    self.build_called = True

    # Retrieve paths
    self.data_dirpath = self.config['dirpaths']['data_dirpath']
    self.log_dirpath = self.config['dirpaths']['log_dirpath']
    self.cwd_dirpath = self.config['dirpaths']['cwd_dirpath']

    # Retrieve parameters
    self.hparams = self.config['params']['hparams']
    self.lightning_params = self.config['params']['lightning_params']
    self.early_stop_callback_params = self.config['params']['early_stop_callback_params']
    self.prepare_data_params = self.config['params']['prepare_data_params']
    #-
    self.test_size_from_dev = self.prepare_data_params['test_size_from_dev']
    #-
    self.model_name = self.hparams['model_name']
    self.num_gen_sentences = self.hparams['num_gen_sentences']
    self.no_repeat_ngram_size = self.hparams['no_repeat_ngram_size']
    self.train_batch_size = self.hparams['train_batch_size']
    self.eval_batch_size = self.hparams['eval_batch_size']
    self.source_max_length = self.hparams['source_max_length']
    self.target_max_length = self.hparams['target_max_length']
    self.temperature = self.hparams['temperature']
    self.top_p = self.hparams['top_p']
    self.learning_rate = self.hparams['learning_rate']
    self.eps = self.hparams['eps']
    self.seed = self.hparams['seed']
    #-
    self.num_gpus = self.lightning_params['num_gpus'] if torch.cuda.is_available() else 0
    self.profiler = self.lightning_params['profiler']
    self.max_epochs = self.lightning_params['max_epochs']
    self.accumulate_grad_batches = self.lightning_params['accumulate_grad_batches']
    self.check_val_every_n_epoch = self.lightning_params['check_val_every_n_epoch']
    self.progress_bar_refresh_rate = self.lightning_params['progress_bar_refresh_rate']
    self.gradient_clip_val = self.lightning_params['gradient_clip_val']
    self.fast_dev_run = self.lightning_params['fast_dev_run']
    #-
    self.monitor = self.early_stop_callback_params['monitor']
    self.min_delta = self.early_stop_callback_params['min_delta']
    self.patience = self.early_stop_callback_params['patience']
    self.verbose = self.early_stop_callback_params['verbose']
    self.mode = self.early_stop_callback_params['mode']

    # Create additional attributes
    self.tokenizer = T5Tokenizer.from_pretrained(self.config['params']['hparams']['model_name'])
    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    self.MODEL = None

    # Trainer
    if self.fast_dev_run:
        self.TRAINER = pl.Trainer(
            gpus=self.num_gpus,
            checkpoint_callback=False,  # Disable checkpoint saving.
            fast_dev_run=True
        )
    else:
        checkpoint_callback = ModelCheckpoint(
            dirpath=self.data_dirpath,
            save_top_k=-1
        )
        early_stop_callback = EarlyStopping(
            monitor=self.early_stop_callback_params['monitor'],
            min_delta=self.early_stop_callback_params['min_delta'],
            patience=self.early_stop_callback_params['patience'],
            verbose=self.early_stop_callback_params['verbose'],
            mode=self.early_stop_callback_params['mode']
        )
        callbacks = [early_stop_callback, checkpoint_callback]
        if self.num_gpus > 0:
            gpu_stats = GPUStatsMonitor()
            callbacks.append(gpu_stats)
            tb_logger = pl.loggers.TensorBoardLogger(f"{self.log_dirpath}")
        else:
            tb_logger = None
        self.TRAINER = pl.Trainer(
            gpus=self.lightning_params['num_gpus'],
            profiler=self.lightning_params['profiler'],
            max_epochs=self.lightning_params['max_epochs'],
            accumulate_grad_batches=self.lightning_params['accumulate_grad_batches'],
            check_val_every_n_epoch=self.lightning_params['check_val_every_n_epoch'],
            progress_bar_refresh_rate=self.lightning_params['progress_bar_refresh_rate'],
            callbacks=callbacks,
            resume_from_checkpoint=None,
            logger=tb_logger
        )
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_integers(device_count_mock, is_available_mock):
    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 2])
    expected = ["2", "4"]
    assert gpu_ids == expected
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_unset(device_count_mock, is_available_mock):
    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 0])
    expected = ["1", "0"]
    assert gpu_ids == expected
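# The three _get_gpu_ids tests (integers above, the unset case, and UUIDs
# further below) pin down the CUDA_VISIBLE_DEVICES remapping: Lightning's
# device indices are positions into whatever that variable lists, falling back
# to the indices themselves when it is unset. A standalone sketch of that
# mapping (get_gpu_ids here is a hypothetical re-implementation for
# illustration, not the library function):
import os

def get_gpu_ids(device_ids):
    """Map local device indices to the ids nvidia-smi should be queried with."""
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is None:
        return [str(i) for i in device_ids]
    entries = [entry.strip() for entry in visible.split(",")]
    return [entries[i] for i in device_ids]

# For example, with CUDA_VISIBLE_DEVICES="3,2,4" (one value consistent with
# the integer test), indices [1, 2] resolve to ["2", "4"]; with the variable
# unset, [1, 0] stays ["1", "0"].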
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             multiple_trainloader_mode: str = "max_size_cycle") -> Tuple[Trainer, StoringLogger]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers. That includes a diagnostic logger for use in unit tests, that is also returned as the second
    return value.

    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
    :param num_nodes: The number of nodes to use in distributed training.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
    num_gpus = container.num_gpus_per_node()
    effective_num_gpus = num_gpus * num_nodes
    strategy = None
    if effective_num_gpus == 0:
        accelerator = "cpu"
        devices = 1
        message = "CPU"
    else:
        accelerator = "gpu"
        devices = num_gpus
        message = f"{devices} GPU"
        if effective_num_gpus > 1:
            # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
            # GPU memory).
            # Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin
            # prints out lengthy warnings about the performance impact of find_unused_parameters.
            strategy = DDPPlugin(find_unused_parameters=container.pl_find_unused_parameters)
            message += "s per node with DDP"
    logging.info(f"Using {message}")
    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
    loggers = [tensorboard_logger, AzureMLLogger(False)]
    storing_logger = StoringLogger()
    loggers.append(storing_logger)
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # Note that switching to deterministic models can have large performance downside.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True
    # The last checkpoint is considered the "best" checkpoint. For large segmentation models, this still appears to
    # be the best way of choosing them because validation loss on the relatively small training patches is not stable
    # enough. Going by the validation loss somehow works for the Prostate model, but not for the HeadAndNeck model.
    # Note that "last" is somehow a misnomer, it should rather be "latest". There is a "last" checkpoint written in
    # every epoch. We could use that for recovery too, but it could happen that the job gets preempted right during
    # writing that file, and we would end up with an invalid file.
    last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                               save_last=True,
                                               save_top_k=0)
    recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                                   filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
                                                   every_n_val_epochs=container.autosave_every_n_val_epochs,
                                                   save_last=False)
    callbacks: List[Callback] = [
        last_checkpoint_callback,
        recovery_checkpoint_callback,
    ]
    if container.monitor_loading:
        # TODO antonsc: Remove after fixing the callback.
        raise NotImplementedError("Monitoring batch loading times has been temporarily disabled.")
        # callbacks.append(BatchTimeCallback())
    if num_gpus > 0 and container.monitor_gpu:
        logging.info("Adding monitoring for GPU utilization")
        callbacks.append(GPUStatsMonitor(intra_step_time=True, inter_step_time=True))
    # Add the additional callbacks that were specified in get_trainer_arguments for LightningContainers
    additional_args = container.get_trainer_arguments()
    # Callbacks can be specified via the "callbacks" argument (the legacy behaviour) or the new get_callbacks method
    if "callbacks" in additional_args:
        more_callbacks = additional_args.pop("callbacks")
        if isinstance(more_callbacks, list):
            callbacks.extend(more_callbacks)  # type: ignore
        else:
            callbacks.append(more_callbacks)  # type: ignore
    callbacks.extend(container.get_callbacks())
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None:
        progress_bar_refresh_rate = 50
        logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
                     f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
    if is_azureml_run:
        callbacks.append(AzureMLProgressBar(refresh_rate=progress_bar_refresh_rate,
                                            write_to_logging_info=True,
                                            print_timestamp=False))
    else:
        callbacks.append(TQDMProgressBar(refresh_rate=progress_bar_refresh_rate))
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like numgpu and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      strategy=strategy,
                      max_epochs=container.num_epochs,
                      # Both these arguments can be integers or floats. If integers, it is the number of batches.
                      # If float, it's the fraction of batches. We default to 1.0 (processing all batches).
                      limit_train_batches=container.pl_limit_train_batches or 1.0,
                      limit_val_batches=container.pl_limit_val_batches or 1.0,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      check_val_every_n_epoch=container.pl_check_val_every_n_epoch,
                      callbacks=callbacks,
                      logger=loggers,
                      num_nodes=num_nodes,
                      devices=devices,
                      precision=precision,
                      sync_batchnorm=True,
                      detect_anomaly=container.detect_anomaly,
                      profiler=container.pl_profiler,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
                      multiple_trainloader_mode=multiple_trainloader_mode,
                      **additional_args)
    return trainer, storing_logger
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_uuids(device_count_mock, is_available_mock):
    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 2])
    expected = ["GPU-56d78e9f", "GPU-02a46c8e"]
    assert gpu_ids == expected
def training_loop(
    run_dir='.',                # Output directory.
    training_set_kwargs={},     # Options for training set.
    data_loader_kwargs={},      # Options for torch.utils.data.DataLoader.
    G_kwargs={},                # Options for generator network.
    D_kwargs={},                # Options for discriminator network.
    G_opt_kwargs={},            # Options for generator optimizer.
    D_opt_kwargs={},            # Options for discriminator optimizer.
    augment_kwargs=None,        # Options for augmentation pipeline. None = disable.
    loss_kwargs={},             # Options for loss function.
    metrics=[],                 # Metrics to evaluate during training.
    random_seed=0,              # Global random seed.
    num_gpus=1,                 # Number of GPUs participating in the training.
    # rank=0,                   # Rank of the current process in [0, num_gpus[.
    batch_size=4,               # Total batch size for one training iteration. Can be larger than batch_gpu * num_gpus.
    batch_gpu=4,                # Number of samples processed at a time by one GPU.
    ema_kimg=10,                # Half-life of the exponential moving average (EMA) of generator weights.
    ema_rampup=None,            # EMA ramp-up coefficient.
    G_reg_interval=4,           # How often to perform regularization for G? None = disable lazy regularization.
    D_reg_interval=16,          # How often to perform regularization for D? None = disable lazy regularization.
    augment_p=0,                # Initial value of augmentation probability.
    ada_target=None,            # ADA target value. None = fixed p.
    ada_interval=4,             # How often to perform ADA adjustment?
    ada_kimg=500,               # ADA adjustment speed, measured in how many kimg it takes for p to increase/decrease by one unit.
    total_kimg=25000,           # Total length of the training, measured in thousands of real images.
    kimg_per_tick=4,            # Progress snapshot interval.
    image_snapshot_ticks=50,    # How often to save image snapshots? None = disable.
    network_snapshot_ticks=50,  # How often to save network snapshots? None = disable.
    resume_pkl=None,            # Network pickle to resume training from.
    cudnn_benchmark=True,       # Enable torch.backends.cudnn.benchmark?
    allow_tf32=False,           # Enable torch.backends.cuda.matmul.allow_tf32 and torch.backends.cudnn.allow_tf32?
    abort_fn=None,              # Callback function for determining whether to abort training. Must return consistent results across ranks.
    progress_fn=None,           # Callback function for updating training progress. Called for all ranks.
):
    # Initialize.
    start_time = time.time()
    # device = torch.device('cuda', rank)
    # np.random.seed(random_seed * num_gpus + rank)
    # torch.manual_seed(random_seed * num_gpus + rank)
    # torch.backends.cudnn.benchmark = cudnn_benchmark  # Improves training speed.
    seed_everything(random_seed)
    torch.backends.cuda.matmul.allow_tf32 = allow_tf32  # Allow PyTorch to internally use tf32 for matmul
    torch.backends.cudnn.allow_tf32 = allow_tf32  # Allow PyTorch to internally use tf32 for convolutions
    conv2d_gradfix.enabled = True  # Improves training speed.
    grid_sample_gradfix.enabled = True  # Avoids errors with the augmentation pipe.

    # Load training set.
    # if rank == 0:
    #     print('Loading training set...')
    # training_set = dnnlib.util.construct_class_by_name(**training_set_kwargs)  # subclass of training.dataset.Dataset
    # training_set_sampler = misc.InfiniteSampler(dataset=training_set, rank=rank, num_replicas=num_gpus, seed=random_seed)
    # training_set_iterator = iter(torch.utils.data.DataLoader(dataset=training_set, sampler=training_set_sampler, batch_size=batch_size//num_gpus, **data_loader_kwargs))
    # if rank == 0:
    #     print()
    #     print('Num images: ', len(training_set))
    #     print('Image shape:', training_set.image_shape)
    #     print('Label shape:', training_set.label_shape)
    #     print()

    # Construct networks.
    # if rank == 0:
    #     print('Constructing networks...')
    training_set_pl = StyleGANDataModule(batch_gpu, training_set_kwargs, data_loader_kwargs)
    training_set = training_set_pl.training_set
    common_kwargs = dict(c_dim=training_set.label_dim,
                         img_resolution=training_set.resolution,
                         img_channels=training_set.num_channels)
    G = dnnlib.util.construct_class_by_name(**G_kwargs, **common_kwargs)  # subclass of torch.nn.Module
    D = dnnlib.util.construct_class_by_name(**D_kwargs, **common_kwargs)  # subclass of torch.nn.Module

    # # Resume from existing pickle.
    # if (resume_pkl is not None) and (rank == 0):
    #     print(f'Resuming from "{resume_pkl}"')
    #     with dnnlib.util.open_url(resume_pkl) as f:
    #         resume_data = legacy.load_network_pkl(f)
    #     for name, module in [('G', G), ('D', D), ('G_ema', G_ema)]:
    #         misc.copy_params_and_buffers(resume_data[name], module, require_all=False)

    # # Print network summary tables.
    # if rank == 0:
    #     z = torch.empty([batch_gpu, G.z_dim], device=device)
    #     c = torch.empty([batch_gpu, G.c_dim], device=device)
    #     img = misc.print_module_summary(G, [z, c])
    #     misc.print_module_summary(D, [img, c])

    # Setup augmentation.
    # if rank == 0:
    #     print('Setting up augmentation...')
    augment_pipe = None
    ada_stats = None
    if (augment_kwargs is not None) and (augment_p > 0 or ada_target is not None):
        augment_pipe = dnnlib.util.construct_class_by_name(**augment_kwargs)  # subclass of torch.nn.Module
        augment_pipe.p.copy_(torch.as_tensor(augment_p))
        # if ada_target is not None:
        #     ada_stats = training_stats.Collector(regex='Loss/signs/real')

    fid50k = FID(max_real=None, num_gen=50000)
    ema_kimg /= num_gpus
    ada_kimg /= num_gpus
    kimg_per_tick /= num_gpus
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    net = StyleGAN2(G=G,
                    D=D,
                    G_opt_kwargs=G_opt_kwargs,
                    D_opt_kwargs=D_opt_kwargs,
                    augment_pipe=augment_pipe,
                    datamodule=training_set_pl,
                    G_reg_interval=G_reg_interval,
                    D_reg_interval=D_reg_interval,
                    ema_kimg=ema_kimg,
                    ema_rampup=ema_rampup,
                    ada_target=ada_target,
                    ada_interval=ada_interval,
                    ada_kimg=ada_kimg,
                    metrics=[fid50k],
                    kimg_per_tick=kimg_per_tick,
                    random_seed=random_seed,
                    **loss_kwargs)
    trainer = pl.Trainer(gpus=num_gpus,
                         accelerator='ddp',
                         weights_summary='full',
                         fast_dev_run=10,
                         benchmark=cudnn_benchmark,
                         max_steps=total_kimg // batch_size * 1000,
                         plugins=[
                             DDPPlugin(broadcast_buffers=False,
                                       find_unused_parameters=True)
                         ],
                         callbacks=[gpu_stats],
                         accumulate_grad_batches=num_gpus)
    trainer.fit(net, datamodule=training_set_pl)