def setup_finetune_model(self, model_config: DictConfig):
    """
    Prepare this model for fine-tuning on the data described by ``model_config``.

    Data loaders are rebuilt from the manifests in the config, the labels seen before
    this call (if any) are compared with the labels of the new training data, and the
    decoder is replaced by one built from ``model_config.decoder``.

    Args:
        model_config: cfg with a mandatory ``train_ds``, optional ``validation_ds`` and
            ``test_ds``, and a ``decoder`` section whose ``num_classes`` must equal the
            number of labels found in the fine-tuning data.

    Returns:
        None

    Raises:
        KeyError: if ``train_ds`` is missing from ``model_config`` or is None.
        ValueError: if the new training data yields no labels, or if
            ``decoder.num_classes`` differs from the number of labels.
    """
    # Capture the labels present before the new training data is set up.
    scratch_labels = self.dataset.labels if hasattr(self, 'dataset') else None

    logging.info("Setting up data loaders with manifests provided from model_config")

    if 'train_ds' not in model_config or model_config.train_ds is None:
        raise KeyError("train_ds is not found in model_config but you need it for fine tuning")
    self.setup_training_data(model_config.train_ds)

    if self.dataset.labels is None or len(self.dataset.labels) == 0:
        raise ValueError(f'New labels must be non-empty list of labels. But I got: {self.dataset.labels}')

    # Validation and test sets are optional.
    if 'validation_ds' in model_config and model_config.validation_ds is not None:
        self.setup_multiple_validation_data(model_config.validation_ds)

    if 'test_ds' in model_config and model_config.test_ds is not None:
        self.setup_multiple_test_data(model_config.test_ds)

    # checking for new finetune dataset labels
    if scratch_labels == self.dataset.labels:
        logging.warning(
            "Trained dataset labels are same as finetune dataset labels -- continuing change of decoder parameters"
        )
    elif scratch_labels is None:
        logging.warning(
            "Either you provided a dummy manifest file during training from scratch or you restored from a pretrained nemo file"
        )

    # Work on a copy so the caller's config object is not shared with this model.
    new_decoder_config = copy.deepcopy(model_config.decoder)
    label_count = len(self.dataset.labels)
    if new_decoder_config['num_classes'] != label_count:
        raise ValueError(
            "number of classes provided {} is not same as number of different labels in finetuning data: {}".format(
                new_decoder_config['num_classes'], len(self.dataset.labels)
            )
        )

    # Swap the old decoder for a freshly built one and record its config.
    del self.decoder
    self.decoder = EncDecSpeakerLabelModel.from_config_dict(new_decoder_config)

    with open_dict(self._cfg.decoder):
        self._cfg.decoder = new_decoder_config

    logging.info(f"Changed decoder output to # {self.decoder._num_classes} classes.")
def main(cfg) -> None:
    """
    Restore a MegatronGPTModel, initialise the new soft prompts listed in cfg, and fit.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.exp_manager``,
            ``cfg.restore_from_path`` and the ``new_prompt_*`` lists under ``cfg.model``.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    ddp_plugin = NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)
    trainer = Trainer(plugins=[ddp_plugin], **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel.restore_from(cfg.restore_from_path, cfg.model, trainer=trainer)

    # Init all new prompts; the per-tag settings live in lists parallel to new_prompt_tags.
    for position, prompt_tag in enumerate(cfg.model.new_prompt_tags):
        init_method = cfg.model.new_prompt_init_methods[position]

        if init_method == "text":
            model.init_prompt_from_text(prompt_tag, cfg.model.new_prompt_init_text[position])
        elif init_method == 'random':
            model.init_prompt_from_random(prompt_tag)
        else:
            # Unknown methods are only reported; the tag is left uninitialised.
            logging.info(f'\n Soft prompt init method {init_method} is not recognized, please use text or random')

    logging.info(f'\nCurrent soft prompts include {model.get_prompt_table()}')
    trainer.fit(model)
def main(cfg) -> None:
    """
    Build a trainer from cfg, then fit and test a MegatronXNlIModel.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``
            and the optional ``cluster_type`` key.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    precision = cfg.trainer.precision

    ddp_plugin = NLPDDPPlugin(
        # Only bf16 uses fp32_grad_accum.
        no_ddp_communication_hook=(megatron_amp_o2 and precision == 'bf16'),
        gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
        find_unused_parameters=False,
    )
    plugins = [ddp_plugin]

    if precision in [16, 'bf16']:
        # A gradient scaler is only created for precision 16; bf16 passes None.
        scaler = (
            GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
            if precision == 16
            else None
        )
        precision_plugin_cls = MegatronHalfPrecisionPlugin if megatron_amp_o2 else NativeMixedPrecisionPlugin
        plugins.append(precision_plugin_cls(precision=precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer.checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for slot, existing_callback in enumerate(trainer.callbacks):
        if isinstance(existing_callback, Timer):
            trainer.callbacks[slot] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronXNlIModel(cfg.model, trainer)
    trainer.fit(model)
    trainer.test(model)
def _add_prompt_tag(self, prompt_tag): if not hasattr(self, 'prompt_table'): raise AttributeError( 'Please set "use_soft_prompts" in cfg to True') self.prompt_table.add(prompt_tag) self.prompts_to_tune.add(prompt_tag) # Add new prompt tag to cfg for loading prompt table at inference with open_dict(self.cfg): self.cfg.existing_prompt_tags = list(self.prompt_table)
def main(cfg) -> None:
    """
    Build a trainer from cfg and fit a MegatronT5Model.

    When exp_manager found a checkpoint to resume from and tensor model parallelism is
    enabled, the checkpoint path is rewritten to point at this rank's ``mp_rank_XX``
    directory.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``
            and the optional ``cluster_type`` key.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)]

    if cfg.trainer.precision == 16:
        grad_scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
        )
        plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=grad_scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path
    if resume_from_checkpoint is not None:
        tensor_parallel_size = cfg.model.tensor_model_parallel_size
        if tensor_parallel_size is not None and tensor_parallel_size > 1:
            # inject mp_rank into resume_from_checkpoint:
            # <root>/<dir>/<name>  ->  <root>/mp_rank_XX/<name>
            mp_rank = compute_model_parallel_rank(trainer.local_rank, cfg.model.tensor_model_parallel_size)
            found_path = Path(resume_from_checkpoint)
            rank_dir = found_path.parent.parent.joinpath(f'mp_rank_{mp_rank:02d}')
            resume_from_checkpoint = str(rank_dir.joinpath(found_path.name))

    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer.checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for slot, existing_callback in enumerate(trainer.callbacks):
        if isinstance(existing_callback, Timer):
            trainer.callbacks[slot] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronT5Model(cfg.model, trainer)
    trainer.fit(model)
def on_train_end(self):
    """
    Finalise the model after training and save it to ``cfg.nemo_path``.

    P-tuned prompts are moved into the prompt table when the virtual prompt style is
    "p-tuning". The cfg is then updated so all tasks count as existing tasks and the
    virtual prompt style is 'inference', and the model is saved.
    """
    # Save p-tuned prompts to prompt table for inference or future task training
    was_p_tuning = self.virtual_prompt_style == "p-tuning"
    if was_p_tuning:
        self.add_ptuned_prompts_to_prompt_table()
        logging.info("All p-tuned prompts where moved to the prompt table.")

    # Move new tags to existing tag list for loading during inference later
    with open_dict(self.cfg):
        all_tasks = self.existing_tasks + self.new_tasks
        self.cfg.existing_tasks = all_tasks
        self.cfg.new_tasks = []
        self.cfg.virtual_prompt_style = 'inference'

    # Save the best nemo model
    self.save_to(save_path=self.cfg.nemo_path)
    logging.info(f"The final model was saved to {self.cfg.nemo_path}")
def main(cfg) -> None:
    """
    Build a trainer from cfg and fit a MegatronGPTPromptLearningModel.

    The model is restored from ``cfg.model.restore_path`` when that key is set to a
    truthy value; otherwise a new model is constructed from ``cfg.model``.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``
            and the optional ``cluster_type`` key.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    ddp_plugin = NLPDDPPlugin(no_ddp_communication_hook=True, find_unused_parameters=False,)
    plugins = [ddp_plugin]

    if cfg.trainer.precision == 16:
        grad_scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
            hysteresis=cfg.model.get('hysteresis', 2),
        )
        plugins.append(
            PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=grad_scaler)
        )

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Override timer callback to a stateless one
    for slot, existing_callback in enumerate(trainer.callbacks):
        if isinstance(existing_callback, Timer):
            trainer.callbacks[slot] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    # load existing or init new soft prompt GPT model
    if not cfg.model.get("restore_path", None):
        model = MegatronGPTPromptLearningModel(cfg.model, trainer=trainer)
    else:
        model = MegatronGPTPromptLearningModel.restore_from(
            cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector()
        )

    trainer.fit(model)
def on_train_end(self):
    """
    Finalise the model after training and save it to ``cfg.nemo_path``.

    When the virtual prompt style is P_TUNING, the p-tuned prompts are moved into the
    prompt table and the instance is switched to INFERENCE style with the prompt table
    as its source. The cfg is then updated so all tasks count as existing tasks and
    the stored virtual prompt style is the INFERENCE value, and the model is saved.
    """
    # Save p-tuned prompts to prompt table for inference or future task training
    was_p_tuning = self.virtual_prompt_style == VirtualPromptStyle.P_TUNING
    if was_p_tuning:
        self.add_ptuned_prompts_to_prompt_table()
        logging.info("All p-tuned prompts where moved to the prompt table.")
        self.virtual_prompt_style = VirtualPromptStyle.INFERENCE
        self.virtual_prompt_source = VirtualPromptSource.PROMPT_TABLE

    # Move new tags to existing tag list for loading during inference later
    with open_dict(self.cfg):
        all_tasks = self.existing_tasks + self.new_tasks
        self.cfg.existing_tasks = all_tasks
        self.cfg.new_tasks = []
        self.cfg.virtual_prompt_style = VirtualPromptStyle.INFERENCE.value

    # Save the best nemo model
    self.save_to(save_path=self.cfg.nemo_path)
    logging.info(f"The final model was saved to {self.cfg.nemo_path}")
def main(cfg) -> None:
    """
    Build a trainer from cfg and fit a MegatronBertModel.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``
            and the optional ``cluster_type`` key.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    plugins = [NLPDDPPlugin(find_unused_parameters=False)]

    if cfg.trainer.precision == 16:
        grad_scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
        )
        plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=grad_scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer.checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for slot, existing_callback in enumerate(trainer.callbacks):
        if isinstance(existing_callback, Timer):
            trainer.callbacks[slot] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronBertModel(cfg.model, trainer)
    trainer.fit(model)
def main(cfg) -> None:
    """
    Load a GPT model (prompt-learning, .nemo, or raw checkpoint) and run text generation.

    Generation is exercised three ways: ``model.generate``, ``trainer.predict`` over a
    DataLoader of the prompts, and, when ``cfg.server`` is truthy, an inference server.

    Args:
        cfg: inference config; reads ``cfg.trainer``, the model-parallel sizes, the
            model/checkpoint locations, ``cfg.inference``, ``cfg.prompts``,
            ``cfg.server`` and ``cfg.port``.

    Raises:
        ValueError: if none of ``virtual_prompt_model_file``, ``gpt_model_file`` or
            ``checkpoint_dir`` is provided.
    """
    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)

    available_devices = cfg.trainer.devices * cfg.trainer.num_nodes
    required_devices = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    assert (
        available_devices == required_devices
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    if cfg.get('virtual_prompt_model_file', None) is not None:
        # Load prompt tuned model, virtual_prompt_model_file must be provided in config

        # Update frozen GPT model path in case it has changed
        prompt_learning_cfg = MegatronGPTPromptLearningModel.restore_from(
            cfg.virtual_prompt_model_file, trainer=trainer, return_config=True
        )
        with open_dict(prompt_learning_cfg):
            prompt_learning_cfg.language_model_path = cfg.gpt_model_file

        # Now load prompt learning model with frozen gpt model base
        model = MegatronGPTPromptLearningModel.restore_from(
            restore_path=cfg.virtual_prompt_model_file, trainer=trainer, override_config_path=prompt_learning_cfg
        )
    elif cfg.gpt_model_file:
        # Or load regular GPT model
        model = MegatronGPTModel.restore_from(restore_path=cfg.gpt_model_file, trainer=trainer)
    elif cfg.checkpoint_dir:
        app_state = AppState()
        uses_model_parallel = cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1
        if uses_model_parallel:
            app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
            (
                app_state.tensor_model_parallel_rank,
                app_state.pipeline_model_parallel_rank,
                app_state.model_parallel_size,
                app_state.data_parallel_size,
                app_state.pipeline_model_parallel_split_rank,
            ) = fake_initialize_model_parallel(
                world_size=app_state.model_parallel_size,
                rank=trainer.global_rank,
                tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
                pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
                pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
            )
        raw_checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)
        checkpoint_path = inject_model_parallel_rank(raw_checkpoint_path)
        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    # Have to turn off activations_checkpoint_method for inference.
    # The wrapped language model may sit under either attribute; a missing one is skipped.
    for holder_name in ('model', 'frozen_model'):
        try:
            getattr(model, holder_name).language_model.encoder.activations_checkpoint_method = None
        except AttributeError:
            pass

    inference_cfg = cfg.inference
    length_params: LengthParam = {
        "max_length": inference_cfg.tokens_to_generate,
        "min_length": inference_cfg.min_tokens_to_generate,
    }
    sampling_params: SamplingParam = {
        "use_greedy": inference_cfg.greedy,
        "temperature": inference_cfg.temperature,
        "top_k": inference_cfg.top_k,
        "top_p": inference_cfg.top_p,
        "repetition_penalty": inference_cfg.repetition_penalty,
        "add_BOS": inference_cfg.add_BOS,
        "all_probs": inference_cfg.all_probs,
        "compute_logprob": inference_cfg.compute_logprob,
    }

    banner = "***************************"

    # First method of running text generation, call model.generate method
    response = model.generate(
        inputs=OmegaConf.to_container(cfg.prompts), length_params=length_params, sampling_params=sampling_params
    )
    print(banner)
    print(response)
    print(banner)

    # Second method of running text generation, call trainer.predict
    # NOTE(review): this reads 'virtual_prompt_model' while the loader above reads
    # 'virtual_prompt_model_file' -- confirm which key is intended.
    collate_fn = (lambda x: list(x)) if cfg.get('virtual_prompt_model', False) else None
    request_dataset = RequestDataSet(OmegaConf.to_container(cfg.prompts))
    request_dl = DataLoader(dataset=request_dataset, collate_fn=collate_fn, batch_size=2)
    model.set_inference_config(OmegaConf.to_container(cfg.inference))
    response = trainer.predict(model, request_dl)
    print(banner)
    print(response)
    print(banner)

    # Third method of running text generation, use inference server
    if cfg.server:
        is_serving_rank = (
            parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0
        )
        if is_serving_rank:
            server = MegatronServer(model.cuda())
            server.run("0.0.0.0", port=cfg.port)

        # Remaining ranks wait for a broadcast signal and generate when it is 0.
        while True:
            choice = torch.cuda.LongTensor(1)
            torch.distributed.broadcast(choice, 0)
            if choice[0].item() == 0:
                generate(model.cuda())
def main(cfg) -> None:
    """
    Train a MegatronT5PTuneModel, optionally test it, export it and run sample inference.

    Steps:
        1. Register a TemplateProcessor for every entry in ``cfg.model.task_processors``.
        2. Build the trainer and fit the model.
        3. If ``cfg.model.data.test_ds.file_path`` is set, test the trained model, then
           reload the best checkpoint and test that as well.
        4. If ``cfg.model.nemo_path`` is set, save the model to a ``.nemo`` file.
        5. If ``cfg.model.infer_samples`` is set, run inference on those queries.

    Steps 4 and 5 use the best-checkpoint model when step 3 loaded one and fall back
    to the freshly trained model otherwise, so they no longer fail with a NameError
    when no test file is configured.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``
            and the optional ``cluster_type`` key.
    """
    separator = "==========================================================================================="

    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    # setup the data processor
    for processor_config in cfg.model.task_processors:
        processor = TemplateProcessor(
            template=processor_config.template, limit_length_field=processor_config.limit_length_field
        )
        register_taskdata_processor(processor_config.taskname, processor)

    plugins = [NLPDDPPlugin()]
    if cfg.trainer.precision == 16:
        scaler = GradScaler(
            init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
            growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
        )
        plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager
    resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for idx, callback in enumerate(trainer.callbacks):
        if isinstance(callback, Timer):
            trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronT5PTuneModel(cfg.model, trainer)
    trainer.fit(model)

    # Default for the export/inference steps below; replaced by the best checkpoint
    # when the test branch reloads one.
    best_eval_model = model

    if cfg.model.data.test_ds.file_path:
        logging.info(separator)
        logging.info("Starting the testing of the trained model on test set...")
        trainer.test(model)
        logging.info("Testing finished!")
        logging.info(separator)

        # extract the path of the best checkpoint from the training, you may update it to any checkpoint
        checkpoint_path = trainer.checkpoint_callback.best_model_path
        tensor_parallel_size = cfg.model.tensor_model_parallel_size
        pathobj = Path(checkpoint_path)
        checkpoint_folder = str(pathobj.parent)
        checkpoint_name = str(pathobj.name)

        rank = trainer.accelerator.training_type_plugin.local_rank
        if tensor_parallel_size > 1:
            # inject model parallel rank
            checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name)
        else:
            checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name)

        # Load the checkpoint
        best_eval_model = MegatronT5PTuneModel.load_from_checkpoint(
            checkpoint_path=checkpoint_path, strict=False, trainer=trainer
        )
        logging.info(f'Best checkpoint path: {checkpoint_path}')
        logging.info("Running Test with best EVAL checkpoint!")
        # setup the test dataset
        # best_eval_model.setup_test_data(test_data_config=cfg.model.data.test_ds)
        if torch.distributed.is_initialized():
            torch.distributed.barrier()
        trainer.test(model=best_eval_model, ckpt_path=None, verbose=False)
        logging.info("Beset EVAL Testing finished!")
        logging.info(separator)

    if cfg.model.nemo_path:
        # '.nemo' file contains the last checkpoint and the params to initialize the model
        best_eval_model.save_to(cfg.model.nemo_path)
        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # perform inference on a list of queries.
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info(separator)
        logging.info("Starting the inference on some sample queries...")

        results = best_eval_model.cuda().ptune_inference(
            queries=cfg.model.infer_samples, batch_size=1, decode_token_len=5
        )
        logging.info('The prediction results of some sample queries with the trained model:')
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')

        logging.info("Inference finished!")
        logging.info(separator)
def register_artifact(self, model, config_path: str, src: str, verify_src_exists: bool = True):
    """
    Register model artifacts with this function. These artifacts (files) will be included inside
    .nemo file when model.save_to("mymodel.nemo") is called.

    How it works:

    1. It always returns existing absolute path which can be used during Model constructor call.
       EXCEPTION: src is None or "" in which case nothing will be done and src will be returned.
    2. It will add (config_path, model_utils.ArtifactItem()) pair to model.artifacts.

    Resolution order for "src":

    * If "src" is a local existing path, it is returned in absolute path form.
    * elif "src" starts with "nemo:", the remainder is resolved relative to the folder the .nemo
      file was unpacked to (app_state.nemo_file_folder).
    * elif a file with the same basename exists in that folder (backward compatibility), that
      path is returned.
    * else an error is raised, or None is returned when verify_src_exists is False.

    WARNING: use .register_artifact calls in your models' constructors.
    The returned path is not guaranteed to exist after you have exited your model's constructor.

    Args:
        model: ModelPT object to register artifact for.
        config_path (str): Artifact key. Usually corresponds to the model config.
        src (str): Path to artifact.
        verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact
            will return None even if src is not found. Defaults to True.

    Returns:
        str: If src is not None or empty it always returns absolute path which is guaranteed to exist
        during model instance life. Returns src unchanged when it is None or empty, and None when the
        artifact is optional and was not found.

    Raises:
        FileNotFoundError: if src cannot be resolved and verify_src_exists is True.
    """
    # Honor the documented exception up front: without this guard, None crashes in
    # os.path.basename and "" resolves to the current working directory.
    if src is None or src == "":
        return src

    app_state = AppState()

    artifact_item = model_utils.ArtifactItem()

    # This is for backward compatibility, if the src objects exists simply inside of the tarfile
    # without its key having been overriden, this pathway will be used.
    src_obj_name = os.path.basename(src)
    if app_state.nemo_file_folder is not None:
        src_obj_path = os.path.abspath(os.path.join(app_state.nemo_file_folder, src_obj_name))
    else:
        src_obj_path = src_obj_name

    abs_src = os.path.abspath(src)

    # src is a local existing path - register artifact and return exact same path for usage by the model
    if os.path.exists(abs_src):
        return_path = abs_src
        artifact_item.path_type = model_utils.ArtifactPathType.LOCAL_PATH

    # this is the case when artifact must be retried from the nemo file
    # we are assuming that the location of the right nemo file is available from _MODEL_RESTORE_PATH
    elif src.startswith("nemo:"):
        return_path = os.path.abspath(os.path.join(app_state.nemo_file_folder, src[5:]))
        artifact_item.path_type = model_utils.ArtifactPathType.TAR_PATH

    # backward compatibility implementation
    elif os.path.exists(src_obj_path):
        return_path = src_obj_path
        artifact_item.path_type = model_utils.ArtifactPathType.TAR_PATH
    else:
        if verify_src_exists:
            raise FileNotFoundError(
                f"src path does not exist or it is not a path in nemo file. src value I got was: {src}. Absolute: {os.path.abspath(src)}"
            )
        else:
            # artifact is optional and we simply return None
            return None

    assert os.path.exists(return_path)

    artifact_item.path = abs_src
    model.artifacts[config_path] = artifact_item

    # we were called by ModelPT
    if hasattr(model, "cfg"):
        with open_dict(model._cfg):
            OmegaConf.update(model.cfg, config_path, return_path)
    return return_path
def main(cfg) -> None:
    """
    Build a trainer from cfg and train and/or test a MegatronNMTModel.

    When ``cfg.model.pretrained_model_path`` is set, the config of the pretrained T5 or
    BART model is loaded, overridden with the NMT settings from ``cfg.model``, and used
    to restore a MegatronNMTModel from that path. Otherwise the model is built directly
    from ``cfg.model``.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``,
            ``cfg.do_training``, ``cfg.do_testing`` and the optional ``cluster_type`` key.

    Raises:
        ValueError: if a pretrained model path is given without ``pretrained_model_type``.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    ddp_plugin = NLPDDPPlugin(
        no_ddp_communication_hook=True,
        gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
        find_unused_parameters=False,
    )
    plugins = [ddp_plugin]

    if cfg.trainer.precision in [16, 'bf16']:
        # A gradient scaler is only created for precision 16; bf16 passes None.
        scaler = (
            GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
            if cfg.trainer.precision == 16
            else None
        )
        precision_plugin_cls = MegatronHalfPrecisionPlugin if megatron_amp_o2 else PipelineMixedPrecisionPlugin
        plugins.append(precision_plugin_cls(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])

    # tokenizers will be trained and and tarred training data will be created if needed
    # model config is then updated
    if cfg.model.preproc_out_dir is not None:
        MTDataPreproc(cfg=cfg.model, trainer=trainer)

    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager;
    # an explicit cfg.model.resume_from_checkpoint takes precedence.
    resume_from_checkpoint = cfg.model.resume_from_checkpoint
    if resume_from_checkpoint is None:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for slot, existing_callback in enumerate(trainer.callbacks):
        if isinstance(existing_callback, Timer):
            trainer.callbacks[slot] = StatelessTimer(cfg.trainer.max_time,)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    use_pretrained = hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None
    if not use_pretrained:
        model = MegatronNMTModel(cfg.model, trainer)
    else:
        if not hasattr(cfg.model, 'pretrained_model_type'):
            raise ValueError("Pretrained model type must be in [T5, BART].")

        assert cfg.model.pretrained_model_type in ['T5', 'BART']
        pretrained_cls = MegatronT5Model if cfg.model.pretrained_model_type == 'T5' else MegatronBARTModel
        pretrained_cfg = pretrained_cls.restore_from(
            cfg.model.pretrained_model_path, trainer=trainer, return_config=True
        )

        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.masked_softmax_fusion = False

            # Set source and target language/multilingual
            pretrained_cfg.src_language = cfg.model.src_language
            pretrained_cfg.tgt_language = cfg.model.tgt_language
            pretrained_cfg.multilingual = cfg.model.multilingual
            pretrained_cfg.shared_tokenizer = True

            # Max generation delta
            pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta

            # Set label smoothing
            pretrained_cfg.label_smoothing = cfg.model.label_smoothing

            # Set tokenizer paths:
            pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer
            pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer

            # Pre-trained models should use the legacy sentencepiece tokenizer ex: mT5
            pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True
            pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True

            # Override dropout
            pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout
            pretrained_cfg.attention_dropout = cfg.model.attention_dropout

            # Override precision
            pretrained_cfg.precision = cfg.model.precision  # Set above from trainer.precision

            # Override data and global/micro batch size.
            pretrained_cfg.train_ds = cfg.model.train_ds
            pretrained_cfg.validation_ds = cfg.model.validation_ds
            pretrained_cfg.test_ds = cfg.model.test_ds
            pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size
            pretrained_cfg.global_batch_size = cfg.model.global_batch_size

            # Class target for the new class being restored.
            pretrained_cfg.target = (
                "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel"
            )

            # Optimizer overrides.
            pretrained_cfg.optim = cfg.model.optim

        model = MegatronNMTModel.restore_from(
            cfg.model.pretrained_model_path,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=NLPSaveRestoreConnector(),
        )

    if cfg.do_training:
        trainer.fit(model)

    if cfg.do_testing:
        trainer.test(model)
def main(cfg) -> None:
    """
    Restore a T5 model for fine-tuning, then fit, validate and optionally test it.

    The base config is read from ``cfg.model.restore_from_path`` and overridden with
    values from ``cfg``. A MegatronT5GLUEModel is restored when
    ``cfg.model.data.train_ds`` has a ``task_name`` attribute; otherwise a
    MegatronT5FinetuneModel is restored.

    Args:
        cfg: experiment config; reads ``cfg.trainer``, ``cfg.model``, ``cfg.exp_manager``
            and the optional ``cluster_type`` key.
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    ddp_plugin = NLPDDPPlugin(
        no_ddp_communication_hook=True,
        gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
        find_unused_parameters=False,
    )
    plugins = [ddp_plugin]

    if cfg.trainer.precision in [16, 'bf16']:
        # A gradient scaler is only created for precision 16; bf16 passes None.
        scaler = (
            GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
            if cfg.trainer.precision == 16
            else None
        )
        precision_plugin_cls = MegatronHalfPrecisionPlugin if megatron_amp_o2 else PipelineMixedPrecisionPlugin
        plugins.append(precision_plugin_cls(precision=cfg.trainer.precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # update resume from checkpoint found by exp_manager;
    # an explicit cfg.model.resume_from_checkpoint takes precedence.
    resume_from_checkpoint = cfg.model.resume_from_checkpoint
    if resume_from_checkpoint is None:
        resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path
    logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
    trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint)

    # Override timer callback to a stateless one
    for slot, existing_callback in enumerate(trainer.callbacks):
        if isinstance(existing_callback, Timer):
            trainer.callbacks[slot] = StatelessTimer(cfg.trainer.max_time,)

    # Get the T5 Base configuration.
    t5_cfg = MegatronT5FinetuneModel.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
    )

    # Override the T5 configuration with the one from the config file.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False)
        t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1)
        t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1)
        t5_cfg.data = cfg.model.data
        t5_cfg.precision = cfg.trainer.precision
        t5_cfg.optim = cfg.model.optim
        t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size
        t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size
        # XNLI has eval languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    # Both model classes are restored with identical arguments; only the class differs.
    has_task_name = hasattr(cfg.model.data.train_ds, 'task_name')
    model_cls = MegatronT5GLUEModel if has_task_name else MegatronT5FinetuneModel
    model = model_cls.restore_from(
        restore_path=cfg.model.restore_from_path,
        trainer=trainer,
        override_config_path=t5_cfg,
        save_restore_connector=NLPSaveRestoreConnector(),
    )

    trainer.fit(model)
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)
def main(cfg) -> None:
    """Restore a Megatron GPT model from ``cfg.restore_from_path`` and fit it.

    The function logs the experiment config, assembles the trainer plugins
    (DDP, optional mixed precision, optional elastic environment), creates the
    trainer, fills in the model-parallel fields of ``AppState`` when tensor or
    pipeline parallelism is larger than one, swaps ``Timer`` callbacks for
    ``StatelessTimer``, copies the trainer precision into the model config and
    finally restores the model and calls ``trainer.fit``.

    Args:
        cfg: experiment config. Reads ``cfg.trainer``, ``cfg.exp_manager``,
            ``cfg.model``, ``cfg.restore_from_path`` and the optional
            ``cfg.cluster_type``.

    Returns:
        None
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]

    precision = cfg.trainer.precision
    if precision in [16, 'bf16']:
        # A gradient scaler is only created for fp16; bf16 passes scaler=None.
        scaler = (
            GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
            if precision == 16
            else None
        )
        precision_plugin_cls = MegatronHalfPrecisionPlugin if megatron_amp_o2 else PipelineMixedPrecisionPlugin
        plugins.append(precision_plugin_cls(precision=precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    app_state = AppState()
    tensor_parallel_size = cfg.model.tensor_model_parallel_size
    pipeline_parallel_size = cfg.model.pipeline_model_parallel_size
    if tensor_parallel_size > 1 or pipeline_parallel_size > 1:
        app_state.model_parallel_size = tensor_parallel_size * pipeline_parallel_size
        tensor_rank, pipeline_rank, model_parallel_size, _ = fake_initialize_model_parallel(
            world_size=app_state.model_parallel_size,
            rank=trainer.global_rank,
            tensor_model_parallel_size_=tensor_parallel_size,
            pipeline_model_parallel_size_=pipeline_parallel_size,
        )
        # Store the results on AppState in the same order as they are returned.
        app_state.tensor_model_parallel_rank = tensor_rank
        app_state.pipeline_model_parallel_rank = pipeline_rank
        app_state.model_parallel_size = model_parallel_size

    # Replace every Timer callback with a stateless one.
    for position, existing_callback in enumerate(trainer.callbacks):
        if not isinstance(existing_callback, Timer):
            continue
        trainer.callbacks[position] = StatelessTimer(cfg.trainer.max_time, )

    # Hydra interpolation does not work here: the interpolation key is lost
    # when PTL saves hparams, so the precision is copied explicitly.
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel.restore_from(cfg.restore_from_path, cfg.model, trainer=trainer)
    trainer.fit(model)
def update_cfg(cfg, key, val):
    """Set attribute ``key`` of ``cfg`` to ``val``.

    ``cfg`` is first switched to struct mode, then the assignment is made
    inside an ``open_dict`` block. Struct mode is turned on by this function
    and is not switched back off here.

    Args:
        cfg: OmegaConf config object to modify in place.
        key: attribute name to set on ``cfg``.
        val: value to store under ``key``.

    Returns:
        None
    """
    OmegaConf.set_struct(cfg, True)

    with open_dict(cfg):
        setattr(cfg, key, val)
def legacy_model_config_to_new_model_config(model_cfg: DictConfig) -> DictConfig:
    """
    Convert an old style model config into a structured config built from
    :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_config.PunctuationCapitalizationModelConfig`.

    Old style configs are the ones used before the ``common_dataset_parameters`` item was introduced. They keep
    shared dataset settings under ``dataset`` instead of ``common_dataset_parameters`` and use ``batch_size`` instead
    of ``tokens_in_batch``. Tarred datasets are not supported by old style configs.

    Every key found in the optional ``retain_during_legacy_conversion`` section of ``model_cfg`` is copied into the
    result unchanged.

    Args:
        model_cfg: old style config

    Returns:
        model config which follows dataclass
            :class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_config.PunctuationCapitalizationModelConfig`
    """
    legacy_train_ds = model_cfg.get('train_ds')
    legacy_validation_ds = model_cfg.get('validation_ds')
    legacy_test_ds = model_cfg.get('test_ds')
    dataset = model_cfg.dataset
    punct_head_config = model_cfg.get('punct_head', {})
    capit_head_config = model_cfg.get('capit_head', {})

    def _convert_data_config(legacy_ds, is_train):
        # Absent dataset sections stay absent in the new config.
        if legacy_ds is None:
            return None
        return legacy_data_config_to_new_data_config(legacy_ds, dataset, train=is_train)

    def _build_head(head_config, num_layers_key):
        # The two heads differ only in the legacy key holding the layer count.
        return HeadConfig(
            num_fc_layers=head_config.get(num_layers_key, HeadConfig.num_fc_layers),
            fc_dropout=head_config.get('fc_dropout', HeadConfig.fc_dropout),
            activation=head_config.get('activation', HeadConfig.activation),
            use_transformer_init=head_config.get('use_transformer_init', HeadConfig.use_transformer_init),
        )

    new_model_config = PunctuationCapitalizationModelConfig(
        class_labels=model_cfg.class_labels,
        common_dataset_parameters=CommonDatasetParametersConfig(
            pad_label=dataset.pad_label,
            ignore_extra_tokens=dataset.get('ignore_extra_tokens', CommonDatasetParametersConfig.ignore_extra_tokens),
            ignore_start_end=dataset.get('ignore_start_end', CommonDatasetParametersConfig.ignore_start_end),
            punct_label_ids=model_cfg.punct_label_ids,
            capit_label_ids=model_cfg.capit_label_ids,
        ),
        train_ds=_convert_data_config(legacy_train_ds, True),
        validation_ds=_convert_data_config(legacy_validation_ds, False),
        test_ds=_convert_data_config(legacy_test_ds, False),
        punct_head=_build_head(punct_head_config, 'punct_num_fc_layers'),
        capit_head=_build_head(capit_head_config, 'capit_num_fc_layers'),
        tokenizer=model_cfg.tokenizer,
        language_model=model_cfg.language_model,
        optim=model_cfg.optim,
    )
    omega_conf = OmegaConf.structured(new_model_config)

    # Carry over the explicitly retained legacy keys.
    with open_dict(omega_conf):
        retained = model_cfg.get('retain_during_legacy_conversion', {})
        for retained_key in retained.keys():
            omega_conf[retained_key] = retained[retained_key]
    return omega_conf
def main(cfg) -> None:
    """Restore a T5 model, freeze it, then validate and optionally test it.

    The function logs the experiment config, assembles the trainer plugins
    (DDP, optional mixed precision, optional elastic environment), creates the
    trainer, swaps ``Timer`` callbacks for ``StatelessTimer``, loads the stored
    model config, overrides its data-related fields from ``cfg`` and finally
    restores the model, freezes it and runs validate / (optionally) test.

    Args:
        cfg: experiment config. Reads ``cfg.trainer``, ``cfg.exp_manager``,
            ``cfg.model`` and the optional ``cfg.cluster_type``.

    Returns:
        None
    """
    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False)
    plugins = [
        NLPDDPPlugin(
            no_ddp_communication_hook=True,
            gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
            find_unused_parameters=False,
        )
    ]

    precision = cfg.trainer.precision
    if precision in [16, 'bf16']:
        # A gradient scaler is only created for fp16; bf16 passes scaler=None.
        scaler = (
            GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2**32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
            if precision == 16
            else None
        )
        precision_plugin_cls = MegatronHalfPrecisionPlugin if megatron_amp_o2 else NativeMixedPrecisionPlugin
        plugins.append(precision_plugin_cls(precision=precision, device='cuda', scaler=scaler))

    if cfg.get('cluster_type', None) == 'BCP':
        plugins.append(TorchElasticEnvironment())

    trainer = Trainer(plugins=plugins, **cfg.trainer)
    exp_manager(trainer, cfg.exp_manager)

    # Replace every Timer callback with a stateless one.
    for position, existing_callback in enumerate(trainer.callbacks):
        if not isinstance(existing_callback, Timer):
            continue
        trainer.callbacks[position] = StatelessTimer(cfg.trainer.max_time, )

    # Load only the configuration stored with the restored model.
    t5_cfg = MegatronT5GLUEModel.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
    )

    # Overlay values from the experiment config.
    # NOTE: Only data can be overridden here, since the file being restored
    # should already correspond to a GLUE/XNLI finetuned model.
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.precision = cfg.trainer.precision
        # Overwrite the data configs.
        t5_cfg.data = cfg.model.data
        # XNLI keeps its evaluation languages in the yaml config.
        if hasattr(cfg.model, 'eval_languages'):
            t5_cfg.eval_languages = cfg.model.eval_languages

    # A validation_ds with a task_name selects the GLUE model class.
    model_cls = MegatronT5GLUEModel if hasattr(t5_cfg.data.validation_ds, 'task_name') else MegatronT5FinetuneModel
    model = model_cls.restore_from(
        restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
    )

    model.freeze()
    trainer.validate(model)
    if hasattr(cfg.model.data, 'test_ds'):
        trainer.test(model)