def test_omegaconf(self):
    """Ensure omegaconf raises an error when an unexpected argument is passed"""
    with pytest.raises(OmegaConfBaseException):
        exp_manager(pl.Trainer(accelerator='cpu'), {"unused": 1})
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])

    # Tokenizers are trained and tarred training data is created if needed;
    # the model config is then updated.
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # Update resume-from-checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override the timer callback with a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Hydra interpolation does not work here, as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None:
        if not hasattr(cfg.model, 'pretrained_model_type'):
            raise ValueError("Pretrained model type must be in [T5, BART].")

        assert cfg.model.pretrained_model_type in ['T5', 'BART']
        if cfg.model.pretrained_model_type == 'T5':
            pretrained_cfg = MegatronT5Model.restore_from(
                cfg.model.pretrained_model_path, trainer=trainer, return_config=True
            )
        else:
            pretrained_cfg = MegatronBARTModel.restore_from(
                cfg.model.pretrained_model_path, trainer=trainer, return_config=True
            )
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.masked_softmax_fusion = False
            # Set source and target language/multilingual
            pretrained_cfg.src_language = cfg.model.src_language
            pretrained_cfg.tgt_language = cfg.model.tgt_language
            pretrained_cfg.multilingual = cfg.model.multilingual
            pretrained_cfg.shared_tokenizer = True
            # Max generation delta
            pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta
            # Set label smoothing
            pretrained_cfg.label_smoothing = cfg.model.label_smoothing
            # Set tokenizer paths
            pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer
            pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer
            # Pre-trained models (e.g. mT5) should use the legacy sentencepiece tokenizer
            pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True
            pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True
            # Override dropout
            pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout
            pretrained_cfg.attention_dropout = cfg.model.attention_dropout
            # Override precision (set above from trainer.precision)
            pretrained_cfg.precision = trainer.precision
            # Override micro/global batch size
            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size
            # O2 AMP
            pretrained_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False)
            # Override data configs and their global/micro batch sizes
            pretrained_cfg.train_ds = cfg.model.train_ds
            pretrained_cfg.train_ds.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.train_ds.global_batch_size = cfg.model.global_batch_size
            pretrained_cfg.validation_ds = cfg.model.validation_ds
            pretrained_cfg.test_ds = cfg.model.test_ds
            # Class target for the new class being restored
            pretrained_cfg.target = "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel"
            # Optimizer overrides
            pretrained_cfg.optim = cfg.model.optim
        model = MegatronNMTModel.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    else:
        model = MegatronNMTModel(cfg.model, trainer)

    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
def main(cfg: DictConfig) -> None:
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with a single GPU and '
        'no DDP to obtain accurate results'
    )

    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError('model.test_ds was not found in the config, skipping evaluation')

    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)

    if not cfg.pretrained_model:
        raise ValueError(
            'To run the evaluation and inference script, a pre-trained model or .nemo file must be provided. '
            f'Choose from {TokenClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"'
        )

    if os.path.exists(cfg.pretrained_model):
        model = TokenClassificationModel.restore_from(cfg.pretrained_model)
    elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names():
        model = TokenClassificationModel.from_pretrained(cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide a path to a pre-trained .nemo checkpoint or choose from {TokenClassificationModel.list_available_models()}'
        )

    data_dir = cfg.model.dataset.get('data_dir', None)
    if not data_dir:
        raise ValueError(
            'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file '
            'with the "model.dataset.data_dir" argument'
        )
    if not os.path.exists(data_dir):
        raise ValueError(f'{data_dir} is not found')

    model.update_data_dir(data_dir=data_dir)
    model._cfg.dataset.use_cache = False

    if model.prepare_test(trainer):
        model.setup_test_data()
        trainer.test(model)
    else:
        raise ValueError('Terminating evaluation')

    model.evaluate_from_file(
        text_file=os.path.join(data_dir, cfg.model.test_ds.text_file),
        labels_file=os.path.join(data_dir, cfg.model.test_ds.labels_file),
        output_dir=exp_dir,
        add_confusion_matrix=True,
        normalize_confusion_matrix=True,
    )

    # Run inference on a few examples
    queries = [
        'we bought four shirts from the nvidia gear store in santa clara.',
        'Nvidia is a company.',
    ]
    results = model.add_predictions(queries, output_file='predictions.txt')

    for query, result in zip(queries, results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')

    logging.info(f'Results are saved at {exp_dir}')
def main(cfg: DictConfig) -> None:
    logging.info(f'Config Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    # Initialize the model using the config file
    model = MultiLabelIntentSlotClassificationModel(cfg.model, trainer=trainer)

    # Training
    logging.info("================================================================================================")
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')

    # Stop further testing, as fast_dev_run does not save checkpoints
    if trainer.fast_dev_run:
        return

    # After model training is done, you can load the model from the saved checkpoint
    # and evaluate it on a data file or on given queries.
    logging.info("================================================================================================")
    logging.info("Starting the testing of the trained model on the test set...")
    logging.info("We will load the latest saved checkpoint from the training...")

    # For evaluation and inference, you can load the previously trained model saved in a .nemo file
    # like this in your code, but here we just reuse the trained model:
    # eval_model = MultiLabelIntentSlotClassificationModel.restore_from(restore_path=checkpoint_path)
    eval_model = model

    # Set up the test data, reusing the same config (test section)
    eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir)
    eval_model.setup_test_data(test_data_config=cfg.model.test_ds)

    trainer.test(model=eval_model, ckpt_path=None, verbose=False)
    logging.info("Testing finished!")

    # Optimize the threshold
    eval_model.optimize_threshold(cfg.model.test_ds, 'dev')

    # Run inference on a few examples
    logging.info("======================================================================================")
    logging.info("Evaluating the model on the given queries...")

    # These examples work well if you train the model on the ATIS dataset;
    # for your own dataset, change the examples accordingly.
    queries = [
        'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis',
        'on april first i need a ticket from tacoma to san jose departing before 7 am',
        'how much is the limousine service in boston',
    ]

    # We use the optimized threshold for predictions
    pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, cfg.model.test_ds)
    logging.info('The prediction results of some sample queries with the trained model:')

    for query, intent, slots in zip(queries, pred_intents, pred_slots):
        logging.info(f'Query : {query}')
        logging.info(f'Predicted Intents: {intent}')
        logging.info(f'Predicted Slots: {slots}')

    logging.info("Inference finished!")
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(NativeMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Override the timer callback with a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Get the T5 Base configuration.
    t5_cfg = MegatronT5GLUEModel.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
    )

    # Override the T5 configuration with the one from the config file.
    # NOTE: Only data can be overridden here, since the file being restored should already
    # correspond to a GLUE/XNLI finetuned model.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.precision = cfg.trainer.precision
        # Overwrite data configs
        t5_cfg.data = cfg.model.data
        # XNLI has eval languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    if hasattr(t5_cfg.data.validation_ds, 'task_name'):
        model = MegatronT5GLUEModel.restore_from(
            restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
        )
    else:
        model = MegatronT5FinetuneModel.restore_from(
            restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
        )

    model.freeze()
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)
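For context, Hydra-driven entry points like the one above are typically wrapped with NeMo's hydra_runner decorator and invoked with config overrides on the command line. A minimal sketch of that wiring, assuming NeMo's hydra_runner; the config_path and config_name values, and the example override keys in the comment, are illustrative placeholders, not taken from the snippet above.

import pytorch_lightning as pl
from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils.exp_manager import exp_manager


# The decorator resolves conf/config.yaml plus any CLI overrides into `cfg`,
# e.g.: python train_ctc.py trainer.max_epochs=50 model.train_ds.manifest_filepath=train.json
@hydra_runner(config_path="conf", config_name="config")
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(asr_model)


if __name__ == '__main__':
    main()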
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
    trainer.fit(model)
def main(cfg: DictConfig) -> None:
    pl.seed_everything(42)
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')

    try:
        plugin = NLPDDPPlugin()
    except (ImportError, ModuleNotFoundError):
        plugin = None

    trainer = pl.Trainer(**cfg.trainer, plugins=plugin)
    exp_manager(trainer, cfg.get("exp_manager", None))

    app_state = AppState()
    if cfg.model.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    if 'bert' in cfg.model.language_model.pretrained_model_name:
        if cfg.model.dataset.task == 'sgd':
            if cfg.model.original_nemo_checkpoint is not None:
                model_class = DialogueZeroShotIntentModel
            else:
                model_class = SGDQAModel
        elif cfg.model.dataset.task in ['zero_shot', 'design']:
            model_class = DialogueZeroShotIntentModel
        else:
            model_class = IntentSlotClassificationModel
    elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
        if cfg.model.dataset.task in ['ms_marco', 'mellon_qa']:
            model_class = DialogueGPTGenerationModel
        else:
            model_class = DialogueGPTClassificationModel
    elif (
        'bart' in cfg.model.language_model.pretrained_model_name.lower()
        or 't5' in cfg.model.language_model.pretrained_model_name.lower()
    ):
        # Please use bf16/32 with t5-large and above;
        # see https://github.com/huggingface/transformers/pull/10956
        model_class = DialogueS2SGenerationModel
    elif 'sentence-transformers' in cfg.model.language_model.pretrained_model_name.lower():
        model_class = DialogueNearestNeighbourModel

    if cfg.pretrained_model or (cfg.model.nemo_path and os.path.exists(cfg.model.nemo_path)):
        if cfg.pretrained_model:
            logging.info(f'Loading pretrained model {cfg.pretrained_model}')
            model = model_class.from_pretrained(cfg.pretrained_model)
        else:
            logging.info(f'Restoring model from {cfg.model.nemo_path}')
            model = model_class.restore_from(cfg.model.nemo_path)
        if cfg.do_training:
            model.setup_training_data(train_data_config=cfg.model.train_ds)
            model.setup_multiple_validation_data(val_data_config=cfg.model.validation_ds)
    else:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = model_class(cfg.model, trainer=trainer)

    if cfg.do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    else:
        data_dir = cfg.model.dataset.get('data_dir', None)
        dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir', None)

        if data_dir is None or dialogues_example_dir is None:
            raise ValueError('No dataset directory provided. Skipping evaluation.')
        elif not os.path.exists(data_dir):
            raise ValueError(f'{data_dir} is not found, skipping evaluation on the test set.')
        else:
            if hasattr(model, "update_data_dirs"):
                model.update_data_dirs(data_dir=data_dir, dialogues_example_dir=dialogues_example_dir)
                model._cfg.dataset = cfg.model.dataset

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
        eval_device = [cfg.trainer.devices[0]] if isinstance(cfg.trainer.devices, list) else 1
        trainer = pl.Trainer(devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16)
        model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
        if model.prepare_test(trainer):
            trainer.test(model)
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.model.train_ds.file_path:
        raise ValueError("'train_ds.file_path' needs to be set for training!")

    model = TextClassificationModel(cfg.model, trainer=trainer)
    logging.info("===========================================================================================")
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')
    logging.info("===========================================================================================")

    if cfg.model.nemo_path:
        # The '.nemo' file contains the last checkpoint and the params needed to initialize the model
        model.save_to(cfg.model.nemo_path)
        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # We evaluate the trained model on the test set if test_ds is set in the config file
    if cfg.model.test_ds.file_path:
        logging.info("===========================================================================================")
        logging.info("Starting the testing of the trained model on the test set...")
        trainer.test(model=model, ckpt_path=None, verbose=False)
        logging.info("Testing finished!")
        logging.info("===========================================================================================")

    # Perform inference on a list of queries
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info("===========================================================================================")
        logging.info("Starting inference on some sample queries...")

        # max_seq_length=512 is the maximum length BERT supports
        results = model.classifytext(queries=cfg.model.infer_samples, batch_size=16, max_seq_length=512)

        logging.info('The prediction results of some sample queries with the trained model:')
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')

        logging.info("Inference finished!")
        logging.info("===========================================================================================")
def test_omegaconf(self):
    """Ensure omegaconf raises an error when an unexpected argument is passed"""
    with pytest.raises(OmegaConfBaseException):
        exp_manager(None, {"unused": 1})
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        type=str,
        default="speakerverification_speakernet",
        required=False,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--finetune_config_file",
        type=str,
        required=True,
        help="Path to the speakernet config yaml file used to load the train/validation datasets and the trainer parameters",
    )
    parser.add_argument(
        "--freeze_encoder",
        type=bool,
        required=False,
        default=True,
        help="True if the speakernet encoder parameters need to be frozen while finetuning",
    )
    args = parser.parse_args()

    if args.pretrained_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.pretrained_model)
    elif args.pretrained_model.endswith('.ckpt'):
        logging.info(f"Using local speaker model from checkpoint {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(checkpoint_path=args.pretrained_model)
    else:
        logging.info("Using pretrained speaker recognition model from NGC")
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name=args.pretrained_model)

    finetune_config = OmegaConf.load(args.finetune_config_file)
    if 'test_ds' in finetune_config.model and finetune_config.model.test_ds is not None:
        finetune_config.model.test_ds = None
        logging.warning("Removing test ds")

    speaker_model.setup_finetune_model(finetune_config.model)
    finetune_trainer = pl.Trainer(**finetune_config.trainer)
    speaker_model.set_trainer(finetune_trainer)

    _ = exp_manager(finetune_trainer, finetune_config.get('exp_manager', None))
    speaker_model.setup_optimization(finetune_config.optim)

    if args.freeze_encoder:
        for param in speaker_model.encoder.parameters():
            param.requires_grad = False

    finetune_trainer.fit(speaker_model)
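One caveat worth flagging in the script above: argparse's type=bool converts any non-empty string to True, so passing --freeze_encoder false on the command line would still freeze the encoder. A common workaround is an explicit string-to-bool converter; the sketch below is illustrative and not part of the original script.

from argparse import ArgumentParser, ArgumentTypeError


def str2bool(value: str) -> bool:
    """Parse common boolean spellings from the command line."""
    if value.lower() in ('true', 't', '1', 'yes'):
        return True
    if value.lower() in ('false', 'f', '0', 'no'):
        return False
    raise ArgumentTypeError(f'Expected a boolean, got {value!r}')


parser = ArgumentParser()
# Using str2bool instead of type=bool makes `--freeze_encoder false` behave as expected.
parser.add_argument("--freeze_encoder", type=str2bool, default=True)
print(parser.parse_args(["--freeze_encoder", "false"]).freeze_encoder)  # False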
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.model.train_ds.file_path:
        raise ValueError("'train_ds.file_path' needs to be set for training!")

    model = PTuneTextClassificationModel(cfg.model, trainer=trainer)
    logging.info("===========================================================================================")
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')
    logging.info("===========================================================================================")

    # We evaluate the trained model on the test set if test_ds is set in the config file
    if cfg.model.test_ds.file_path:
        logging.info("===========================================================================================")
        logging.info("Starting the testing of the trained model on the test set...")
        trainer.test(model=model, ckpt_path=None, verbose=False)
        logging.info("Testing finished!")
        logging.info("===========================================================================================")

    # Extract the path of the best checkpoint from the training; you may update it to any checkpoint
    checkpoint_path = trainer.checkpoint_callback.best_model_path
    tensor_parallel_size = cfg.model.tensor_model_parallel_size
    pathobj = pathlib.Path(checkpoint_path)
    checkpoint_folder = str(pathobj.parent)
    checkpoint_name = str(pathobj.name)

    rank = trainer.accelerator.training_type_plugin.local_rank
    if tensor_parallel_size > 1:
        # Inject the model parallel rank
        checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name)
    else:
        checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name)

    # Load the checkpoint
    best_eval_model = PTuneTextClassificationModel.load_from_checkpoint(
        checkpoint_path=checkpoint_path, strict=False, trainer=trainer
    )
    logging.info(f'Best checkpoint path: {checkpoint_path}')
    logging.info("Running test with the best EVAL checkpoint!")

    # Set up the test dataset
    best_eval_model.setup_test_data(test_data_config=cfg.model.test_ds)
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    trainer.test(model=best_eval_model, ckpt_path=None, verbose=False)
    logging.info("Best EVAL testing finished!")
    logging.info("===========================================================================================")

    if cfg.model.nemo_path:
        # The '.nemo' file contains the last checkpoint and the params needed to initialize the model
        best_eval_model.save_to(cfg.model.nemo_path)
        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # Perform inference on a list of queries
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info("===========================================================================================")
        logging.info("Starting inference on some sample queries...")
        results = best_eval_model.cuda().classifytext(
            queries=cfg.model.infer_samples, batch_size=1, prompt='Sentiment'
        )
        logging.info('The prediction results of some sample queries with the trained model:')
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')
        logging.info("Inference finished!")
        logging.info("===========================================================================================")
def main(cfg: DictConfig) -> None:
    logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    bert_joint_ir_model = BertJointIRModel(cfg.model, trainer=trainer)
    trainer.fit(bert_joint_ir_model)
def main(cfg) -> None:
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,  # we don't use DDP for async grad allreduce
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]
    if cfg.trainer.precision in [16, 'bf16']:
        scaler = None
        if cfg.trainer.precision == 16:
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
        if megatron_amp_o2:
            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Update resume-from-checkpoint found by exp_manager
    if cfg.model.resume_from_checkpoint is not None:
        resume_from_checkpoint = cfg.model.resume_from_checkpoint
    else:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override the timer callback with a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # Hydra interpolation does not work here, as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel(cfg.model, trainer)

    trainer.fit(model)
def test_trainer_loggers(self, tmp_path):
    """ Test that a trainer with a logger errors out with a number of arguments.
    Test that it works with create_tensorboard_logger set to False.
    """
    test_trainer = pl.Trainer(accelerator='cpu')  # Should create logger and modelcheckpoint

    with pytest.raises(LoggerMisconfigurationError):  # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"exp_dir": str(tmp_path)})
    with pytest.raises(LoggerMisconfigurationError):  # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"explicit_log_dir": str(tmp_path)})
    with pytest.raises(LoggerMisconfigurationError):  # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"resume_if_exists": True})

    # Check that exp_manager uses trainer.logger and its exp_dir, name, and version
    log_dir = exp_manager(test_trainer, {"create_tensorboard_logger": False, "create_checkpoint_callback": False})
    assert log_dir.resolve() == Path("./lightning_logs/version_0").resolve()
    assert Path("./lightning_logs").exists()
    assert Path("./lightning_logs/version_0").exists()

    # Check that a trainer without a logger gets a logger attached to it
    test_trainer = pl.Trainer(accelerator='cpu', logger=False)
    log_dir = exp_manager(
        test_trainer,
        {"create_tensorboard_logger": True, "create_checkpoint_callback": False, "exp_dir": str(tmp_path)},
    )
    assert isinstance(test_trainer.logger, pl.loggers.TensorBoardLogger)

    test_trainer = pl.Trainer(accelerator='cpu', logger=False)
    # Check that create_wandb_logger=True errors out unless wandb_logger_kwargs is passed
    with pytest.raises(ValueError):
        log_dir = exp_manager(
            test_trainer,
            {
                "create_tensorboard_logger": False,
                "create_checkpoint_callback": False,
                "exp_dir": str(tmp_path),
                "create_wandb_logger": True,
            },
        )
    # Check that a WandbLogger is attached to the logger if create_wandb_logger=True and
    # wandb_logger_kwargs has name and project
    log_dir = exp_manager(
        test_trainer,
        {
            "create_tensorboard_logger": False,
            "create_checkpoint_callback": False,
            "exp_dir": str(tmp_path),
            "create_wandb_logger": True,
            "wandb_logger_kwargs": {"name": "", "project": ""},
        },
    )
    assert isinstance(test_trainer.logger, pl.loggers.WandbLogger)
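Taken together, the tests above imply the shape of a typical exp_manager configuration dict. A minimal sketch follows; the exp_dir and name values are illustrative placeholders, and only the keys also exercised in these tests (plus the standard exp_dir/name pair) are assumed to exist.

# A minimal exp_manager configuration sketch; values are illustrative placeholders.
exp_manager_cfg = {
    "exp_dir": "./nemo_experiments",       # root directory for logs and checkpoints
    "name": "my_experiment",               # experiment name (subdirectory under exp_dir)
    "create_tensorboard_logger": True,     # attach a TensorBoardLogger to the trainer
    "create_checkpoint_callback": True,    # save checkpoints during training
    "resume_if_exists": False,             # set True to resume from an existing run
    "resume_ignore_no_checkpoint": True,   # don't fail when resuming a fresh directory
}
# exp_manager(trainer, exp_manager_cfg) then returns the resolved log directory.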
config.trainer.gpus = 1 if torch.cuda.is_available() else 0
# For mixed precision training, use precision=16 and amp_level=O1
config.trainer.precision = 16 if torch.cuda.is_available() else 32
config.trainer.max_epochs = args.epochs
config.trainer.accelerator = None  # Remove distributed training flags

if args.output != '':
    config.exp_manager.exp_dir = args.output

print(OmegaConf.to_yaml(config))

# Create trainer + model
trainer = pl.Trainer(**config.trainer)
model = nemo_nlp.models.QAModel(cfg=config.model, trainer=trainer)
exp_dir = str(exp_manager(trainer, config.get("exp_manager", None)))
print('experiment directory:', exp_dir)

# Start the training
trainer.fit(model)

# Test the model
model.setup_test_data(test_data_config=config.model.test_ds)
trainer.test(model)

# Example inference
all_preds, all_nbests = model.inference(
    file=config.model.test_ds.file,
    output_nbest_file=os.path.join(exp_dir, 'output_nbest.json'),
    output_prediction_file=os.path.join(exp_dir, 'output_prediction.json'),
)
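After inference, the written prediction file can be inspected directly. A small sketch, assuming the usual SQuAD-style layout of a mapping from question id to predicted answer string (an assumption, not confirmed by the snippet above).

import json
import os

# Load the prediction file written by model.inference above and print a few entries.
# Assumes a SQuAD-style mapping of question id -> predicted answer string.
with open(os.path.join(exp_dir, 'output_prediction.json')) as f:
    predictions = json.load(f)

for qid, answer in list(predictions.items())[:3]:
    print(qid, '->', answer)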
def test_resume(self, tmp_path):
    """ Tests the resume capabilities of exp_manager"""
    test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)

    # Error because explicit_log_dir does not exist
    with pytest.raises(NotFoundError):
        exp_manager(
            test_trainer,
            {
                "exp_dir": str(tmp_path / "test_resume"),
                "resume_if_exists": True,
                "explicit_log_dir": "Does_not_exist",
            },
        )

    # Error because the checkpoints folder does not exist
    with pytest.raises(NotFoundError):
        exp_manager(test_trainer, {"resume_if_exists": True, "exp_dir": str(tmp_path / "test_resume")})

    # No error because we tell exp_manager to ignore the NotFoundError
    exp_manager(
        test_trainer,
        {
            "resume_if_exists": True,
            "exp_dir": str(tmp_path / "test_resume_2"),
            "resume_ignore_no_checkpoint": True,
        },
    )

    test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True)
    # Error because no checkpoints exist in the folder
    with pytest.raises(NotFoundError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end.ckpt").touch()
    # Error because a *end.ckpt in the folder indicates that training has already finished
    with pytest.raises(ValueError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end.ckpt").unlink()
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt").touch()
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last.ckpt").touch()
    # Error because multiple *last.ckpt files are in the folder; with more than one, we don't know which to restore
    with pytest.raises(ValueError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    # Finally succeed
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last.ckpt").unlink()
    log_dir = exp_manager(
        test_trainer,
        {"resume_if_exists": True, "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0")},
    )
    checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt")
    assert Path(test_trainer.checkpoint_connector.resume_from_checkpoint_fit_path).resolve() == checkpoint.resolve()

    # Succeed again and make sure that run_0 exists and previous log files were moved
    test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False)
    exp_manager(test_trainer, {"resume_if_exists": True, "explicit_log_dir": str(log_dir)})
    checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt")
    assert Path(test_trainer.checkpoint_connector.resume_from_checkpoint_fit_path).resolve() == checkpoint.resolve()
    prev_run_dir = Path(tmp_path / "test_resume" / "default" / "version_0" / "run_0")
    assert prev_run_dir.exists()
    prev_log = Path(tmp_path / "test_resume" / "default" / "version_0" / "run_0" / "lightning_logs.txt")
    assert prev_log.exists()
def main(cfg: DictConfig) -> None:
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with a single GPU and '
        'no DDP to obtain accurate results'
    )

    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError('model.test_ds was not found in the config, skipping evaluation')

    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)

    if not cfg.pretrained_model:
        raise ValueError(
            'To run the evaluation and inference script, a pre-trained model or .nemo file must be provided. '
            f'Choose from {PunctuationCapitalizationModel.list_available_models()} or "pretrained_model"="your_model.nemo"'
        )

    if os.path.exists(cfg.pretrained_model):
        model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model)
    elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names():
        model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide a path to a pre-trained .nemo file or choose from {PunctuationCapitalizationModel.list_available_models()}'
        )

    data_dir = cfg.model.dataset.get('data_dir', None)
    if data_dir is None:
        logging.error(
            'No dataset directory provided. Skipping evaluation. '
            'To run evaluation on a file, specify the path to the directory that contains test_ds.text_file and '
            'test_ds.labels_file with the "model.dataset.data_dir" argument.'
        )
    elif not os.path.exists(data_dir):
        logging.error(f'{data_dir} is not found, skipping evaluation on the test set.')
    else:
        model.update_data_dir(data_dir=data_dir)
        model._cfg.dataset = cfg.model.dataset

        if not hasattr(cfg.model, 'test_ds'):
            logging.error('model.test_ds was not found in the config, skipping evaluation')
        elif model.prepare_test(trainer):
            model.setup_test_data(cfg.model.test_ds)
            trainer.test(model)
        else:
            logging.error('Skipping the evaluation. The trainer is not set up properly.')

    # Run inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries, batch_size=len(queries), max_seq_length=512)

    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')

    logging.info(f'Results are saved at {exp_dir}')
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_dir = exp_manager(trainer, cfg.get("exp_manager", None))

    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = TokenClassificationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        model = TokenClassificationModel.from_pretrained(cfg.pretrained_model)

        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            # We can also finetune the pretrained model, but that requires
            # setting up train and validation PyTorch DataLoaders.
            # Set up the data dir to get class-weight statistics
            model.update_data_dir(data_dir=data_dir)
            # Then set up the loss; use model.dataset.class_balancing
            # if you want to add class weights to the CrossEntropyLoss
            model.setup_loss(class_balancing=cfg.model.dataset.class_balancing)
            # Finally, set up train and validation PyTorch DataLoaders
            model.setup_training_data()
            model.setup_validation_data()
            logging.info('Using the config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                'A data dir should be specified for finetuning the pretrained model. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )

    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)

    """
    After model training is done, you can use the model for inference.
    You can either evaluate data from a text_file that follows the training data format,
    or provide a list of queries you want to add entities to.

    During evaluation/testing, it is currently advisable to construct a new Trainer
    with a single GPU and no DDP to obtain accurate results.
    """
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with a single GPU '
        'and no DDP to obtain accurate results'
    )
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    if do_training:
        # Run evaluation on a dataset from a file;
        # only possible if model.dataset.data_dir is specified.
        # Change the path to the file you want to use for the final evaluation.
        model.evaluate_from_file(
            text_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.text_file),
            labels_file=os.path.join(cfg.model.dataset.data_dir, cfg.model.validation_ds.labels_file),
            output_dir=exp_dir,
            add_confusion_matrix=True,
            normalize_confusion_matrix=True,
        )

    # Run inference on a few examples
    queries = ['we bought four shirts from the nvidia gear store in santa clara.', 'Nvidia is a company.']
    results = model.add_predictions(queries)

    for query, result in zip(queries, results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(model)