class MegatronLMEncoderDecoderModel(MegatronBaseModel): """ Megatron encoder-decoder base class """ def __init__(self, cfg: DictConfig, trainer: Trainer): super().__init__(cfg, trainer=trainer) # Make sure trainer.accumulate_grad_batches is 1. self._validate_trainer() # build tokenizer (defaults to nemo supported tokenizers) self._build_tokenizer() # manipulate vocabulary (e.g., pad vocabulary for better efficiency) self._build_vocab() # TODO: Not sure how to use lists of modules with PTL. # This means we can only use pipeline parallelism without the interleaved schedule. self.enc_dec_model = build_model( model_provider_func=self.model_provider_func, wrap_with_ddp=False, model_type=ModelType.encoder_and_decoder, )[0] self.setup_optimizer_param_groups() self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) if self.megatron_amp_o2: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type self.enc_dec_model.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision) if self.cfg.precision == 32: self.autocast_dtype = torch.float elif self.cfg.precision == 16: self.autocast_dtype = torch.half elif self.cfg.precision == 'bf16': self.autocast_dtype = torch.bfloat16 else: raise ValueError('precision must be in [32, 16, "bf16"]') self.enc_dec_model.model_type = ModelType.encoder_and_decoder def _build_tokenizer(self): """ Default tokenizer is based on available nemo tokenizers. Override this method to use an external tokenizer. All tokenizers are expected to provide compatible interface. Override default Encoder-decoder tokenizer to use legacy=True for sentencepiece. """ self.tokenizer = get_nmt_tokenizer( library=self._cfg.tokenizer.library, model_name=self._cfg.tokenizer.type, tokenizer_model=self.register_artifact("tokenizer.model", self._cfg.tokenizer.model), vocab_file=self.register_artifact("tokenizer.vocab_file", self._cfg.tokenizer.vocab_file), merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.merge_file), legacy=True if self._cfg.tokenizer.library == 'sentencepiece' else False, ) def _build_vocab(self): """ Manipulate vocabulary (e.g., pad vocabulary for increased performance)/ """ # TODO: add config to allow to disable it? 
self.padded_vocab_size = self._vocab_size_with_padding( orig_vocab_size=self.tokenizer.vocab_size, make_vocab_size_divisible_by=self._cfg.get('make_vocab_size_divisible_by', 128), tensor_model_parallel_size=self._cfg.get('tensor_model_parallel_size', 1), ) def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder): # TODO: create get_encoder_decoder_model()here for different losses (e..g, nll, vae, mim) model = MegatronTokenLevelEncoderDecoderModule( encoder_arch=self.cfg.encoder_arch, decoder_arch=self.cfg.decoder_arch, vocab_size=self.padded_vocab_size, hidden_size=self.cfg.hidden_size, max_position_embeddings=self.cfg.max_position_embeddings, num_layers=self.cfg.num_layers, num_attention_heads=self.cfg.num_attention_heads, apply_query_key_layer_scaling=self.cfg.get('apply_query_key_layer_scaling', True), kv_channels=self.cfg.get('kv_channels', None), ffn_hidden_size=self.cfg.ffn_hidden_size, num_tokentypes=0, parallel_output=True, pre_process=pre_process, post_process=post_process, init_method_std=self.cfg.get('init_method_std', 0.02), fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), use_cpu_initialization=self.cfg.get('use_cpu_initialization', False), hidden_dropout=self.cfg.get('hidden_dropout', 0.1), attention_dropout=self.cfg.get('attention_dropout', 0.1), precision=self.cfg.get('precision', 16), fp32_residual_connection=self.cfg.get('fp32_residual_connection', False), activations_checkpoint_method=self.cfg.get('activations_checkpoint_method', None), activations_checkpoint_num_layers=self.cfg.get('activations_checkpoint_num_layers', 1), layernorm_epsilon=self.cfg.get('layernorm_epsilon', 1e-5), persist_layer_norm=self.cfg.get('persist_layer_norm', False), bias_gelu_fusion=self.cfg.get('bias_gelu_fusion', True), bias_dropout_add_fusion=self.cfg.get('bias_dropout_add_fusion', True), masked_softmax_fusion=self.cfg.get('masked_softmax_fusion', True), onnx_safe=self.cfg.get('onnx_safe', False), activation=self.cfg.get('activation', 'gelu'), bias=self.cfg.get('bias', True), normalization=self.cfg.get('normalization', 'layernorm'), transformer_block_type=self.cfg.get('transformer_block_type', 'pre_ln'), headscale=self.cfg.get('headscale', False), add_encoder=add_encoder, add_decoder=add_decoder, ) return model def forward( self, encoder_input_ids, decoder_input_ids, encoder_attn_mask, decoder_attn_mask, token_type_ids=None, lm_labels=None, enc_hidden_states=None, enc_output_mask=None, output_enc_hidden_only=False, enc_input=None, ): output_tensor = self.enc_dec_model( enc_input_ids=encoder_input_ids, dec_input_ids=decoder_input_ids, enc_attn_mask=encoder_attn_mask, dec_attn_mask=decoder_attn_mask, token_type_ids=token_type_ids, labels=lm_labels, enc_hidden_states=enc_hidden_states, enc_output_mask=enc_output_mask, output_enc_hidden_only=output_enc_hidden_only, enc_input=enc_input, ) return output_tensor def setup_optimizer_param_groups(self): """ModelPT override. Optimizer will get self._optimizer_param_groups""" self._optimizer_param_groups = get_params_for_weight_decay_optimization([self.enc_dec_model]) def training_step(self, batch, batch_idx): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. Batch should be a list of microbatches and those microbatches should on CPU. Microbatches are then moved to GPU during the pipeline. 
The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() # we prepare the micro batches for the apex fwd/bwd function batch_for_pipeline = self.process_global_batch(batch) encoder_seq_length = batch_for_pipeline[0].size(1) decoder_seq_length = batch_for_pipeline[1].size(1) tensor_shape = [encoder_seq_length, get_micro_batch_size(), self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.enc_dec_model, forward_only=False, tensor_shape=tensor_shape, decoder_sequence_length=decoder_seq_length, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, ) else: losses_reduced_per_micro_batch = forward_backward_no_pipelining( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.enc_dec_model, forward_only=False, tensor_shape=tensor_shape, decoder_sequence_length=decoder_seq_length, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, ) # only the last stages of the pipeline return losses if losses_reduced_per_micro_batch: # average loss across micro batches loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] loss_tensor = torch.concat(loss_tensors_list) loss_mean = loss_tensor.mean() else: loss_mean = torch.tensor(0.0).cuda() # TODO: if we're not using pipeline, then we should do async allreduce (better perf) # in order to do this with O2, we need the async handler to be added to apex fwd/bwd function if self.megatron_amp_o2: # main grads are stored in the MainParamsOptimizer wrapper self._optimizer.allreduce_main_grads() # @sangkug we think this is fine self.allreduce_word_and_position_embeddings() else: self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf) self.allreduce_word_and_position_embeddings() ## logging # we can only log on one rank if it is rank zero so we broadcast from last rank # we can avoid this broadcast by updating the PTL log function to accept specific ranks torch.distributed.broadcast(loss_mean, get_last_rank()) if self.cfg.precision == 16: loss_scale = self.trainer.precision_plugin.scaler._scale if loss_scale is not None: self.log('loss_scale', loss_scale) self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True) lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr, rank_zero_only=True) self.log('global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True) # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( 'consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), prog_bar=True, rank_zero_only=True, ) return loss_mean def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[int] = 0) -> None: super().on_train_batch_end(outputs, batch, batch_idx) # TODO: Replace with newer override for scheduler.step() instead of # search for plugins for fp16 GradScalar if self.trainer.precision_plugin is not None and isinstance( self.trainer.precision_plugin, NativeMixedPrecisionPlugin ): precision_plugin = self.trainer.precision_plugin if ( hasattr(precision_plugin, 
'scaler') and precision_plugin.scaler is not None and isinstance(precision_plugin.scaler, GradScaler) ): grad_scaler = precision_plugin.scaler # If the grad scaler skipped its optimizer step due to infs/nans, # decrement the step of all schedulers. if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True: schedulers = self.trainer.lr_schedulers if not schedulers or not self.trainer.lightning_module.automatic_optimization: return for scheduler in schedulers: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups scheduler['scheduler'].last_epoch -= 2 scheduler['scheduler'].step() # Increase the max step count by 1 self.trainer.fit_loop.max_steps = self.trainer.fit_loop.max_steps + 1 # Reset the optimizer update skipped to `None` - this is to prevent scheduler no-ops during # accumulated gradient updates. grad_scaler.optimizer_update_skipped = None def backward(self, *args, **kwargs): """ LightningModule hook to do backward. We want this to do nothing since we run backward in the fwd/bwd functions from apex. No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): """ LightningModule hook to zero grad. We want this to do nothing as we are zeroing grads during the training_step. """ return def allreduce_gradients(self): """Reduce gradients across data parallel ranks. Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 """ # Bucketize and all-reduce buckets = {} # Pack the buckets. for param in self.parameters(): if param.requires_grad and param.grad is not None: tp = param.data.type() if tp not in buckets: buckets[tp] = [] buckets[tp].append(param) # param.main_grad = param.grad # For each bucket, all-reduce and copy all-reduced grads. for tp in buckets: bucket = buckets[tp] grads = [param.grad.data for param in bucket] coalesced = torch._utils._flatten_dense_tensors(grads) coalesced /= parallel_state.get_data_parallel_world_size() torch.distributed.all_reduce(coalesced, group=parallel_state.get_data_parallel_group()) for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)): buf.copy_(synced) def allreduce_word_and_position_embeddings(self): # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 # All-reduce word_embeddings' grad across first and last stages to ensure # that word_embeddings parameters stay in sync. # This should only run for models that support pipelined model parallelism # (BERT and GPT-2). if parallel_state.get_pipeline_model_parallel_world_size() > 1 and ( parallel_state.is_rank_in_embedding_group() ): if self.enc_dec_model.share_word_embeddings: word_embeddings_weight = self.enc_dec_model.word_embeddings_weight() if self.megatron_amp_o2: # O2 recipe stores a "main" copy of weights and grads grad = word_embeddings_weight.main_grad else: grad = word_embeddings_weight.grad torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) # All reduce position embeddings for T5. 
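# When the pipeline is split between encoder and decoder (pipeline_model_parallel_split_rank), the ranks in the
# position-embedding group each hold a copy of the learned absolute position embeddings, so their gradients are
# all-reduced below to keep those copies in sync, mirroring the word-embedding sync above.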
if ( parallel_state.is_rank_in_position_embedding_group() and parallel_state.get_pipeline_model_parallel_world_size() > 1 and parallel_state.get_pipeline_model_parallel_split_rank() is not None ): position_embeddings_weight = self.enc_dec_model.position_embeddings_weight() if self.megatron_amp_o2: grad = position_embeddings_weight.main_grad else: grad = position_embeddings_weight.grad torch.distributed.all_reduce(grad, group=parallel_state.get_position_embedding_group()) def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(batch, model): batch = [x.cuda(non_blocking=True) for x in batch] encoder_input_ids, decoder_input_ids, loss_mask, lm_labels, encoder_attn_mask, decoder_attn_mask = batch output = model( encoder_input_ids, # enc_input_ids encoder_attn_mask, # enc_attn_mask decoder_input_ids, # dec_input_ids decoder_attn_mask, # dec_attn_mask None, # token_type_ids lm_labels, # labels None, # enc_hidden_states ) def loss_func(output_tensor): loss = self.loss_func(loss_mask, output_tensor) reduced_loss = average_losses_across_data_parallel_group([loss]) return loss, {'avg': reduced_loss} return output, loss_func return fwd_output_and_loss_func def get_forward_output_only_func(self): def fwd_output_only_func(batch, model): batch = [x.cuda(non_blocking=True) for x in batch] if len(batch) == 4: encoder_input_ids, decoder_input_ids, encoder_attn_mask, decoder_attn_mask = batch enc_input = None elif len(batch) == 5: encoder_input_ids, decoder_input_ids, encoder_attn_mask, decoder_attn_mask, enc_input = batch else: raise ValueError("wrong number of items in the batch") output = model( encoder_input_ids, # enc_input_ids encoder_attn_mask, # enc_attn_mask decoder_input_ids, # dec_input_ids decoder_attn_mask, # dec_attn_mask None, # token_type_ids None, # labels enc_input, # enc_hidden_states ) def id_func(output_tensor): return output_tensor, {'logits': output_tensor} return output, id_func return fwd_output_only_func def validation_step(self, batch, batch_idx): batch_for_pipeline = self.process_global_batch(batch) encoder_seq_length = batch_for_pipeline[0].size(1) decoder_seq_length = batch_for_pipeline[1].size(1) tensor_shape = [encoder_seq_length, get_micro_batch_size(), self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.enc_dec_model, forward_only=True, tensor_shape=tensor_shape, decoder_sequence_length=decoder_seq_length, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, ) else: losses_reduced_per_micro_batch = forward_backward_no_pipelining( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.enc_dec_model, forward_only=True, tensor_shape=tensor_shape, decoder_sequence_length=decoder_seq_length, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, ) if losses_reduced_per_micro_batch: # average loss across micro batches loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] loss_tensor = torch.concat(loss_tensors_list) loss_mean = loss_tensor.mean() else: # we're not on the last pipeline stage so no losses loss_mean = [] return loss_mean def validation_epoch_end(self, outputs): if not outputs: return if parallel_state.is_pipeline_last_stage(): # only the last 
pipeline parallel stages return loss averaged_loss = torch.stack(outputs).mean() else: averaged_loss = torch.tensor(0.0).cuda() # we can only log on one rank if it is rank zero so we broadcast from last rank torch.distributed.broadcast(averaged_loss, get_last_rank()) self.log('val_loss', averaged_loss, prog_bar=True, rank_zero_only=True) self.log( 'consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), rank_zero_only=True, ) return averaged_loss def test_step(self, batch, batch_idx): return self.validation_step(batch, batch_idx) def test_epoch_end(self, outputs): if not outputs: return if parallel_state.is_pipeline_last_stage(): # only the last pipeline parallel stages return loss averaged_loss = torch.stack(outputs).mean() else: averaged_loss = torch.tensor(0.0).cuda() # we can only log on one rank if it is rank zero so we broadcast from last rank torch.distributed.broadcast(averaged_loss, get_last_rank()) self.log('test_loss', averaged_loss, prog_bar=True, rank_zero_only=True) self.log( 'consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), rank_zero_only=True, ) return averaged_loss def loss_func(self, loss_mask, tokens_loss): """ This function takes as input per-token loss and masks non-required values. """ losses = tokens_loss.view(-1).float() loss_mask = loss_mask.view(-1).float() # TODO: add nemo version here loss = torch.sum(losses * loss_mask) / loss_mask.sum() # sequence level nll return loss def process_micro_batch(self, micro_batch): """ Micro batch returned by MegatronT5 dataloader""" data_b = micro_batch # Unpack. tokens_enc = data_b['text_enc'].long() tokens_dec = data_b['text_dec'].long() labels = data_b['labels'].long() loss_mask = data_b['loss_mask'].float() enc_mask = data_b['enc_mask'] dec_mask = data_b['dec_mask'] return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask def process_global_batch(self, global_batch): return [ global_batch["text_enc"], global_batch["text_dec"], global_batch["loss_mask"], global_batch["labels"], global_batch["enc_mask"], global_batch["dec_mask"], ] def build_train_valid_test_datasets(self): raise NotImplementedError("Please implement this method in child-class") def build_pretraining_data_loader(self, dataset, consumed_samples): """Build dataloader given an input dataset.""" if dataset is None: return None logging.info(f'Building dataloader with consumed samples: {consumed_samples}') # Megatron sampler if hasattr(self._cfg.data, 'dataloader_type') and self._cfg.data.dataloader_type is not None: if self._cfg.data.dataloader_type == 'single': batch_sampler = MegatronPretrainingBatchSampler( total_samples=len(dataset), consumed_samples=consumed_samples, micro_batch_size=self._cfg.micro_batch_size, global_batch_size=self._cfg.global_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=self._cfg.get('drop_last', True), ) elif self._cfg.data.dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomBatchSampler( total_samples=len(dataset), consumed_samples=consumed_samples, micro_batch_size=self._cfg.micro_batch_size, global_batch_size=self._cfg.global_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=self._cfg.get('drop_last', True), ) else: raise Exception(f'{self._cfg.data.dataloader_type} dataloader type is not supported.') else: raise
ValueError('cfg.data.dataloader_type not found. Must be "single" or "cyclic"') # Torch dataloader. return torch.utils.data.DataLoader( dataset, batch_sampler=batch_sampler, num_workers=self._cfg.data.num_workers, pin_memory=True, ) def setup(self, stage=None): resume_checkpoint_path = self.trainer._checkpoint_connector.resume_from_checkpoint_fit_path if resume_checkpoint_path: try: init_consumed_samples = int( float(re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0]) ) except (ValueError, TypeError): logging.warning("Cannot parse the checkpoint file to get the consumed samples. assume it is zero.") init_consumed_samples = 0 else: init_consumed_samples = 0 self.init_consumed_samples = init_consumed_samples """A PTL method to setup the training, validation and test datasets.""" if stage == 'predict': return if self._train_dl is not None and self._validation_dl is not None: return self.build_train_valid_test_datasets() self.setup_training_data(self._cfg.data) self.setup_validation_data(self._cfg.data) self.setup_test_data(self._cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: self.enc_dec_model.sync_initial_word_embeddings() self.enc_dec_model.sync_initial_position_embeddings() def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) def on_pretrain_routine_start(self) -> None: # keep a copy of init_global_step self.init_global_step = self.trainer.global_step return super().on_pretrain_routine_start() def setup_validation_data(self, cfg): if hasattr(self, '_validation_ds'): consumed_samples = 0 self._validation_dl = self.build_pretraining_data_loader(self._validation_ds, consumed_samples) def setup_test_data(self, cfg): if hasattr(self, '_test_ds'): consumed_samples = 0 self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) def configure_optimizers(self): self.setup_optimization() # Wrap the baseline optimizer with the optimizer class with master parameters if self.megatron_amp_o2 and self._optimizer is not None: if self.cfg.precision == 'bf16': fp32_grad_accum = True contiguous_grad_bucket = True elif self.cfg.precision == 16: fp32_grad_accum = False # TODO: contiguous grad bucket for fp16 is also planned to be supported contiguous_grad_bucket = False # TODO: this should be true when not using pipeline parallelism # we will support that for bf16 when we have async handler from apex # and we will support it for fp16 when we have it implemented in the O2 recipe async_grad_allreduce = False self._optimizer = MainParamsOptimizerWrapper( self._optimizer, fp32_grad_accum=fp32_grad_accum, contiguous_grad_bucket=contiguous_grad_bucket, async_grad_allreduce=async_grad_allreduce, ) assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config." 
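# Note: with megatron_amp_O2 the wrapper constructed above keeps fp32 "main" copies of the fp16/bf16 model
# parameters; that is why training_step calls self._optimizer.allreduce_main_grads() and gradient clipping
# goes through the wrapper's get_parameters() instead of the raw model parameters.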
if hasattr(self._cfg.optim, 'sched'): sched_config = self._cfg.optim.sched sched_config['max_steps'] = self._trainer.max_steps self._scheduler = prepare_lr_scheduler( optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl ) if self._scheduler is None: return self._optimizer else: return [self._optimizer], [self._scheduler] def get_parameters(self): params = [] for param_group in self._optimizer_param_groups: for param in param_group['params']: params.append(param) return params def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() consumed_samples = ( self.init_consumed_samples + steps_since_resume * app_state.data_parallel_size * self.cfg.micro_batch_size * get_num_microbatches() ) return int(consumed_samples) def configure_gradient_clipping(self, *args, **kwargs): """PTL hook to configure gradients. We use gradient clipping implementation from megatron-lm. """ clip_val = self.trainer.gradient_clip_val if clip_val is None: return clip_val = float(clip_val) if clip_val <= 0: return if self.megatron_amp_o2: # grep fp32 master parameters for gradient clipping parameters = self._optimizer.get_parameters() else: parameters = self.get_parameters() grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val) self.log('grad_norm', grad_norm, rank_zero_only=True) def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: request = batch response = self.complete(request) logging.info(f"response: {response}") return response def decode(self, tokens_enc, enc_mask, num_tokens_to_generate, encoder_input=None): app_state = AppState() global_batch_per_gpu = tokens_enc.size(0) num_micro_batches_before_decode = get_num_microbatches() # Reconfigure microbatch calculator here to set num microbatches to 1 while decoding since its not clear how to decode with "grad acc". # TODO: reconfigure back to how things were before decode? _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), micro_batch_size=global_batch_per_gpu, # Make sure that there is no "grad acc" while decoding. data_parallel_size=parallel_state.get_data_parallel_world_size(), ) predicted_tokens_dec = ( torch.LongTensor([self.tokenizer.bos_id] * global_batch_per_gpu).unsqueeze(1).to(tokens_enc.device) ) encoder_seq_length = tokens_enc.size(1) tensor_shape = [encoder_seq_length, global_batch_per_gpu, self.cfg.hidden_size] assert predicted_tokens_dec.size(0) == global_batch_per_gpu for i in range(num_tokens_to_generate): # No microbatches in decoding. Just the global batch. 
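# Greedy decode: each iteration below runs a full forward pass over the growing decoder prefix, takes the
# argmax of the log-softmax over the vocabulary, and appends that token to predicted_tokens_dec.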
decoder_seq_length = predicted_tokens_dec.size(1) dec_mask = predicted_tokens_dec != self.tokenizer.pad_id if encoder_input is not None: batch_for_pipeline = [tokens_enc, predicted_tokens_dec, enc_mask, dec_mask, encoder_input] else: batch_for_pipeline = [tokens_enc, predicted_tokens_dec, enc_mask, dec_mask] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: output_tensor = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_only_func(), batch=batch_for_pipeline, model=self.enc_dec_model, forward_only=True, tensor_shape=tensor_shape, decoder_sequence_length=decoder_seq_length, dtype=self.autocast_dtype, ) else: output_tensor = forward_backward_no_pipelining( forward_step_func=self.get_forward_output_only_func(), batch=batch_for_pipeline, model=self.enc_dec_model, forward_only=True, tensor_shape=tensor_shape, decoder_sequence_length=decoder_seq_length, dtype=self.autocast_dtype, ) # get output tensor if parallel_state.is_pipeline_last_stage(): output_tensor = output_tensor[0]['logits'] output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region(output_tensor) log_probs, token_ids = torch.max(torch.nn.functional.log_softmax(output_tensor, dim=-1), dim=-1) predicted_tokens_dec = torch.cat( [predicted_tokens_dec.to(token_ids.device), token_ids[:, -1].unsqueeze(1)], dim=1 ) else: log_probs = torch.zeros( (predicted_tokens_dec.shape[0], predicted_tokens_dec.shape[1]), dtype=self.autocast_dtype ).cuda() predicted_tokens_dec = torch.zeros( (predicted_tokens_dec.shape[0], predicted_tokens_dec.shape[1] + 1), dtype=predicted_tokens_dec.dtype, ).cuda() if self.cfg.get('pipeline_model_parallel_size', 1) > 1: # Broadcast from the last pipeline stage to all other model-parallel ranks. torch.distributed.broadcast( predicted_tokens_dec, parallel_state.get_pipeline_model_parallel_last_rank(), group=parallel_state.get_model_parallel_group(), ) torch.distributed.broadcast( log_probs, parallel_state.get_pipeline_model_parallel_last_rank(), group=parallel_state.get_model_parallel_group(), ) # Reset microbatch calculator to what it was before decoding. _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), micro_batch_size=global_batch_per_gpu // num_micro_batches_before_decode, data_parallel_size=parallel_state.get_data_parallel_world_size(), ) return predicted_tokens_dec, log_probs def complete(self, request: Dict): """ Autoregressively invokes language model in the inference mode Args: request: Dictionary with the following fields * prompt: a string which text the model should complete. * tokens_to_generate: how many tokens to generate while doing prompt completion. Returns: response: A python dictionary with the following fields * prompt: original text of the prompt * tokenized_prompt: list of (str) tokens from prompt * completion: a python dictionary with the following subfields: * tokens: a list of triples (token, token_id, log_prob) comprising completion * text: completion text (as a single string) """ app_state = AppState() # The complete method only works with global batch = micro batch size = data parallel size = 1. 
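# Illustrative request only (hypothetical values): a dict such as
#   {'prompt': ['<mask> jumped over the fence'], 'masked_sample': <LongTensor of encoder token ids>, 'tokens_to_generate': 16}
# where 'masked_sample' feeds the encoder below and 'tokens_to_generate' bounds the greedy decode loop.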
_reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=1, micro_batch_size=1, data_parallel_size=1, ) app_state = AppState() response = {} self.freeze() # naive greedy slow loop # TODO: add option for BeamSearchDecoder response['prompt'] = request['prompt'][0] response['completion'] = {} tokens_enc = request['masked_sample'] response['masked_input'] = ' '.join(self.tokenizer.ids_to_tokens(tokens_enc[0].cpu().numpy().tolist())) enc_mask = tokens_enc != self.tokenizer.pad_id predicted_tokens_ids, log_probs = self.decode(tokens_enc, enc_mask, int(request['tokens_to_generate'])) predicted_tokens_ids = predicted_tokens_ids.cpu().numpy()[0].tolist() log_probs = log_probs.cpu().numpy()[0].tolist() if self.tokenizer.eos_id in predicted_tokens_ids: idx = predicted_tokens_ids.index(self.tokenizer.eos_id) predicted_tokens_ids = predicted_tokens_ids[:idx] else: predicted_tokens_ids = [id for id in predicted_tokens_ids if id != self.tokenizer.pad_id] if self.tokenizer.eos_id in predicted_tokens_ids: idx = predicted_tokens_ids.index(self.tokenizer.eos_id) predicted_tokens_ids = predicted_tokens_ids[:idx] # Legacy sentencepiece detokenization still preserves special tokens which messes up exact string match. if hasattr(self.tokenizer, 'special_token_to_id'): predicted_tokens_ids = [ id for id in predicted_tokens_ids if id not in self.tokenizer.special_token_to_id.values() ] predicted_tokens_dec = self.tokenizer.ids_to_tokens(predicted_tokens_ids) response['completion']['text'] = self.tokenizer.tokens_to_text(predicted_tokens_dec) response['completion']['tokens'] = list(zip(predicted_tokens_ids, predicted_tokens_dec, log_probs)) self.unfreeze() return response def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by, tensor_model_parallel_size): """Pad vocab size so it is divisible by model parallel size and still having GPU friendly size.""" after = orig_vocab_size multiple = make_vocab_size_divisible_by * tensor_model_parallel_size while (after % multiple) != 0: after += 1 logging.info( f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.' ) return after def _enable_nvidia_optimizations(self): "These optimizations are present in NVIDIA NGC PyTorch Containers" # Version check nvidia_torch_version = os.getenv('NVIDIA_PYTORCH_VERSION', None) if nvidia_torch_version is not None: NVIDIA_TORCH_MAJOR = int(nvidia_torch_version.split('.')[0]) NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1]) # Apex Persistent layer norm is supported from Nvidia PyTorch container v21.11 if NVIDIA_TORCH_MAJOR < 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR < 11): self._cfg.persist_layer_norm = False if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11): # NVFUSER torch._C._jit_set_profiling_executor(True) torch._C._jit_set_profiling_mode(True) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(False) torch._C._jit_set_texpr_fuser_enabled(False) torch._C._jit_set_nvfuser_enabled(True) torch._C._debug_set_autodiff_subgraph_inlining(False) else: # Not a Nvidia container. 
Dependency check is on users pass def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: """ PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device When using pipeline parallelism, we need the global batch to remain on the CPU, since the memory overhead will be too high when using a large number of microbatches. Microbatches are transferred from CPU to GPU inside the pipeline. """ return batch def _validate_trainer(self): """ Certain trainer configurations can break training. Here we try to catch them and raise an error. """ if self.trainer.accumulate_grad_batches > 1: raise ValueError( f'Gradient accumulation is done within training_step. trainer.accumulate_grad_batches must equal 1' ) def list_available_models(self): pass
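# A minimal standalone sketch (not part of the original class; names are illustrative) of the padding rule
# implemented by _vocab_size_with_padding above.
def _example_padded_vocab_size(orig_vocab_size: int, divisible_by: int = 128, tensor_parallel: int = 1) -> int:
    # Round the vocab up to the next multiple of (divisible_by * tensor_parallel) so the embedding
    # table splits evenly across tensor-parallel ranks and keeps GPU-friendly dimensions.
    multiple = divisible_by * tensor_parallel
    return ((orig_vocab_size + multiple - 1) // multiple) * multiple

# e.g. _example_padded_vocab_size(29000, 128, 2) == 29184, i.e. 184 dummy tokens are appended.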
class MegatronGPTModel(NLPModel, TextGeneration): """ Megatron GPT pretraining """ def __init__(self, cfg: DictConfig, trainer: Trainer): if not HAVE_APEX: raise ImportError( "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." ) # this prevents base constructor from initializing tokenizer self.tokenizer = None super().__init__(cfg, trainer=trainer, no_lm_init=True) self._validate_trainer() # used in NVIDIA NGC PyTorch containers self._enable_nvidia_optimizations() if self.cfg.get('use_cpu_initialization', False) is False: torch.cuda.set_device(trainer.local_rank) initialize_model_parallel_for_nemo( world_size=trainer.world_size, global_rank=trainer.global_rank, local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), pipeline_model_parallel_size=cfg.get( 'pipeline_model_parallel_size', 1), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get( 'apex_transformer_log_level', 30), ) self.tokenizer = get_nmt_tokenizer( library=self.cfg.tokenizer.library, model_name=self.cfg.tokenizer.type, tokenizer_model=self.register_artifact("tokenizer.model", self.cfg.tokenizer.model), vocab_file=self.register_artifact("tokenizer.vocab_file", self.cfg.tokenizer.vocab_file), merges_file=self.register_artifact("tokenizer.merge_file", self.cfg.tokenizer.merge_file), delimiter=self.cfg.tokenizer.get('delimiter', None), ) vocab_size = self.tokenizer.vocab_size self.padded_vocab_size = self._vocab_size_with_padding( orig_vocab_size=vocab_size, make_vocab_size_divisible_by=cfg.get( 'make_vocab_size_divisible_by', 128), tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), ) # TODO: Not sure how to use lists of modules with PTL. # This means we can only use pipeline parallelism without the interleaved schedule. 
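# build_model returns a list with one module per virtual pipeline stage; indexing [0] below therefore
# assumes the interleaved (virtual pipeline) schedule is not used, matching the TODO above.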
self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0] self.setup_optimizer_param_groups() self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) if self.megatron_amp_o2: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type self.model.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision self.model = Float16Module(module=self.model, precision=cfg.precision) if self.trainer.precision == 32: self.autocast_dtype = torch.float elif self.trainer.precision == 16: self.autocast_dtype = torch.half elif self.trainer.precision == 'bf16': self.autocast_dtype = torch.bfloat16 else: raise ValueError('precision must be in [32, 16, "bf16"]') # configuration used for inference self._inference_config = None def set_inference_config(self, inference_config): self._inference_config = inference_config def get_inference_config(self): return self._inference_config def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" model = GPTModel( vocab_size=self.padded_vocab_size, hidden_size=self.cfg.hidden_size, max_position_embeddings=self.cfg.max_position_embeddings, num_layers=self.cfg.num_layers, num_attention_heads=self.cfg.num_attention_heads, apply_query_key_layer_scaling=self.cfg.get( 'apply_query_key_layer_scaling', True), kv_channels=self.cfg.get('kv_channels', None), ffn_hidden_size=self.cfg.ffn_hidden_size, num_tokentypes=0, parallel_output=True, pre_process=pre_process, post_process=post_process, init_method_std=self.cfg.get('init_method_std', 0.02), fp16_lm_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), use_cpu_initialization=self.cfg.get('use_cpu_initialization', False), hidden_dropout=self.cfg.get('hidden_dropout', 0.1), precision=self.cfg.get('precision', 16), fp32_residual_connection=self.cfg.get('fp32_residual_connection', False), activations_checkpoint_method=self.cfg.get( 'activations_checkpoint_method', None), activations_checkpoint_num_layers=self.cfg.get( 'activations_checkpoint_num_layers', 1), layernorm_epsilon=self.cfg.get('layernorm_epsilon', 1e-5), onnx_safe=self.cfg.get('onnx_safe', False), persist_layer_norm=self.cfg.get('persist_layer_norm', False), ) return model def forward(self, tokens, text_position_ids, attention_mask, labels): output_tensor = self.model(tokens, text_position_ids, attention_mask, labels=labels) return output_tensor def setup_optimizer_param_groups(self): """ModelPT override. Optimizer will get self._optimizer_param_groups""" self._optimizer_param_groups = get_params_for_weight_decay_optimization( [self.model]) def training_step(self, batch, batch_idx): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. Batch should be a list of microbatches and those microbatches should on CPU. Microbatches are then moved to GPU during the pipeline. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. 
""" # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() # we prepare the micro batches for the apex fwd/bwd function batch_for_pipeline = self.process_global_batch(batch) tensor_shape = [ self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size ] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.model, forward_only=False, tensor_shape=tensor_shape, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, ) else: losses_reduced_per_micro_batch = forward_backward_no_pipelining( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.model, forward_only=False, tensor_shape=tensor_shape, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, ) # only the last stages of the pipeline return losses if losses_reduced_per_micro_batch: # average loss across micro batches loss_tensors_list = [ loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch ] loss_tensor = torch.concat(loss_tensors_list) loss_mean = loss_tensor.mean() else: loss_mean = torch.tensor(0.0).cuda() # TODO: if we're not using pipeline, then we should do async allreduce (better perf) # in order to do this with O2, we need the async handler to be added to apex fwd/bwd function if self.megatron_amp_o2: # main grads are stored in the MainParamsOptimizer wrapper self._optimizer.allreduce_main_grads( ) # @sangkug we think this is fine self.allreduce_first_last_embeddings() else: self.allreduce_gradients( ) # @sangkug we think this is causing memory to blow up (hurts perf) self.allreduce_first_last_embeddings() ## logging # we can only log on one rank if it is rank zero so we broadcast from last rank # we can avoid this broadcast by updating the PTL log function to accept specific ranks torch.distributed.broadcast(loss_mean, get_last_rank()) if self.cfg.precision == 16: loss_scale = self.trainer.precision_plugin.scaler._scale if loss_scale is not None: self.log('loss_scale', loss_scale) self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True) lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr, rank_zero_only=True) self.log('global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True) # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( 'consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), prog_bar=True, rank_zero_only=True, ) return loss_mean def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[int] = 0) -> None: super().on_train_batch_end(outputs, batch, batch_idx) # TODO: Replace with newer override for scheduler.step() instead of # search for plugins for fp16 GradScalar if self.trainer.precision_plugin is not None and isinstance( self.trainer.precision_plugin, NativeMixedPrecisionPlugin): precision_plugin = self.trainer.precision_plugin if (hasattr(precision_plugin, 'scaler') and precision_plugin.scaler is not None and isinstance(precision_plugin.scaler, GradScaler)): grad_scaler = precision_plugin.scaler # If the grad scaler skipped its optimizer step due to infs/nans, # decrement the step of all schedulers. 
if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True: schedulers = self.trainer.lr_schedulers if not schedulers or not self.trainer.lightning_module.automatic_optimization: return for scheduler in schedulers: # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up # as well as update the optimizer lr in all param groups scheduler['scheduler'].last_epoch -= 2 scheduler['scheduler'].step() # Increase the max step count by 1 self.trainer.fit_loop.max_steps = self.trainer.fit_loop.max_steps + 1 # Reset the optimizer update skipped to `None` - this is to prevent scheduler no-ops during # accumulated gradient updates. grad_scaler.optimizer_update_skipped = None def backward(self, *args, **kwargs): """ LightningModule hook to do backward. We want this to do nothing since we run backward in the fwd/bwd functions from apex. No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): """ LightningModule hook to zero grad. We want this to do nothing as we are zeroing grads during the training_step. """ return def allreduce_gradients(self): """Reduce gradients across data parallel ranks. Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 """ # Bucketize and all-reduce buckets = {} for param in self.parameters(): if param.requires_grad and param.grad is not None: tp = param.data.type() if tp not in buckets: buckets[tp] = [] buckets[tp].append(param) # param.main_grad = param.grad # For each bucket, all-reduce and copy all-reduced grads. for tp in buckets: bucket = buckets[tp] grads = [param.grad.data for param in bucket] coalesced = torch._utils._flatten_dense_tensors(grads) coalesced /= parallel_state.get_data_parallel_world_size() torch.distributed.all_reduce( coalesced, group=parallel_state.get_data_parallel_group()) for buf, synced in zip( grads, torch._utils._unflatten_dense_tensors(coalesced, grads)): buf.copy_(synced) def allreduce_first_last_embeddings(self): # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 # All-reduce word_embeddings' grad across first and last stages to ensure # that word_embeddings parameters stay in sync. # This should only run for models that support pipelined model parallelism # (BERT and GPT-2). 
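# With pipeline parallelism the input embedding lives on the first stage and the tied output embedding on the
# last stage; all-reducing their gradients below keeps the shared weight identical on both ranks.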
if parallel_state.get_pipeline_model_parallel_world_size() > 1 and ( parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage()): if self.model.share_word_embeddings: word_embeddings_weight = self.model.word_embeddings_weight() if self.megatron_amp_o2: # O2 recipe stores a "main" copy of weights and grads grad = word_embeddings_weight.main_grad else: grad = word_embeddings_weight.grad torch.distributed.all_reduce( grad, group=parallel_state.get_embedding_group()) def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(batch, model): batch = [x.cuda(non_blocking=True) for x in batch] tokens, labels, loss_mask, attention_mask, position_ids = batch attention_mask = attention_mask[0:1] output_tensor = model(tokens, position_ids, attention_mask, labels) def loss_func(output_tensor): loss = self.loss_func(loss_mask, output_tensor) reduced_loss = average_losses_across_data_parallel_group( [loss]) return loss, {'avg': reduced_loss} return output_tensor, loss_func return fwd_output_and_loss_func def get_forward_output_only_func(self): def fwd_output_only_func(batch, model): extra_arg = {} if len(batch) == 3: batch = [x.cuda() for x in batch] tokens, attention_mask, position_ids = batch attention_mask = attention_mask[0:1] else: ( tokens, attention_mask, position_ids, set_inference_key_value_memory, inference_max_sequence_len, ) = batch tokens = tokens.cuda() attention_mask = attention_mask.cuda() position_ids = position_ids.cuda() attention_mask = attention_mask[0:1] extra_arg[ 'set_inference_key_value_memory'] = set_inference_key_value_memory[ 0].item() extra_arg[ 'inference_max_sequence_len'] = inference_max_sequence_len[ 0].item() output_tensor = model(tokens, position_ids, attention_mask, **extra_arg) def id_func(output_tensor): return output_tensor, {'logits': output_tensor} return output_tensor, id_func return fwd_output_only_func def on_pretrain_routine_start(self) -> None: # keep a copy of init_global_step self.init_global_step = self.trainer.global_step return super().on_pretrain_routine_start() def validation_step(self, batch, batch_idx): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. 
""" batch_for_pipeline = self.process_global_batch(batch) tensor_shape = [ self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size ] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.model, forward_only=True, tensor_shape=tensor_shape, dtype=self.autocast_dtype, ) else: losses_reduced_per_micro_batch = forward_backward_no_pipelining( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, model=self.model, forward_only=True, tensor_shape=tensor_shape, dtype=self.autocast_dtype, ) if losses_reduced_per_micro_batch: # average loss across micro batches loss_tensors_list = [ loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch ] loss_tensor = torch.concat(loss_tensors_list) loss_mean = loss_tensor.mean() else: # we're not on the last pipeline stage so no losses loss_mean = [] return loss_mean def validation_epoch_end(self, outputs): if not outputs: return if parallel_state.is_pipeline_last_stage(): # only the last pipeline parallel stages return loss averaged_loss = torch.stack(outputs).mean() else: averaged_loss = torch.tensor(0.0).cuda() # we can only log on one rank if it is rank zero so we broadcast from last rank torch.distributed.broadcast(averaged_loss, get_last_rank()) self.log('val_loss', averaged_loss, prog_bar=True, rank_zero_only=True) self.log( 'consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), rank_zero_only=True, ) def test_step(self, batch, batch_idx): return self.validation_step(batch, batch_idx) def test_epoch_end(self, outputs): averaged_loss = average_losses_across_data_parallel_group(outputs) logging.info(f'test_loss: {averaged_loss[0]}') def loss_func(self, loss_mask, output_tensor): losses = output_tensor.float() loss_mask = loss_mask.view(-1).float() # TODO: add nemo version here loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() # sequence level nll return loss def process_global_batch(self, global_batch): """ Prepares the global batch for apex fwd/bwd functions. Global batch is a list of micro batches. 
""" return [ global_batch["tokens"], global_batch["labels"], global_batch["loss_mask"], global_batch["attention_mask"], global_batch["position_ids"], ] def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches test_iters = self.trainer.limit_test_batches train_valid_test_num_samples = [ max_train_steps * global_batch_size, eval_iters * global_batch_size, test_iters * global_batch_size, ] self._train_ds, self._validation_ds, self._test_ds = build_train_valid_test_datasets( cfg=self.cfg, trainer=self.trainer, data_prefix=self.cfg.data.data_prefix, data_impl=self.cfg.data.data_impl, splits_string=self.cfg.data.splits_string, train_valid_test_num_samples=train_valid_test_num_samples, seq_length=self.cfg.data.seq_length, seed=self.cfg.seed, skip_warmup=self.cfg.data.get('skip_warmup', True), tokenizer=self.tokenizer, ) if self._train_ds is not None: logging.info(f'Length of train dataset: {len(self._train_ds)}') if self._validation_ds is not None: logging.info(f'Length of val dataset: {len(self._validation_ds)}') if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') logging.info(f'Finished building GPT datasets.') return self._train_ds, self._validation_ds, self._test_ds def build_pretraining_data_loader(self, dataset, consumed_samples): """Buld dataloader given an input dataset.""" if dataset is None: return None logging.info( f'Building dataloader with consumed samples: {consumed_samples}') # Megatron sampler if hasattr(self.cfg.data, 'dataloader_type' ) and self.cfg.data.dataloader_type is not None: if self.cfg.data.dataloader_type == 'single': batch_sampler = MegatronPretrainingBatchSampler( total_samples=len(dataset), consumed_samples=consumed_samples, micro_batch_size=self.cfg.micro_batch_size, global_batch_size=self.cfg.global_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state. get_data_parallel_world_size(), drop_last=self.cfg.get('drop_last', True), ) elif self.cfg.data.dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomBatchSampler( total_samples=len(dataset), consumed_samples=consumed_samples, micro_batch_size=self.cfg.micro_batch_size, global_batch_size=self.cfg.global_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state. get_data_parallel_world_size(), drop_last=self.cfg.get('drop_last', True), ) else: raise ValueError( 'cfg.data.dataloader_type must be "single" or "cyclic"') else: raise ValueError( 'cfg.data.dataloader_type not found. Must be "single" or "cyclic"' ) return torch.utils.data.DataLoader( dataset, batch_sampler=batch_sampler, num_workers=self.cfg.data.num_workers, pin_memory=True, ) def setup(self, stage=None): """ PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" resume_checkpoint_path = self.trainer._checkpoint_connector.resume_from_checkpoint_fit_path if resume_checkpoint_path: try: init_consumed_samples = int( float( re.findall(r"consumed_samples\=([0-9]+.[0-9]+)", resume_checkpoint_path)[0])) except (ValueError, TypeError): logging.warning( "Cannot parse the checkpoint file to get the consumed samples. assume it is zero." ) init_consumed_samples = 0 else: init_consumed_samples = 0 self.init_consumed_samples = init_consumed_samples if stage == 'predict': return else: # TODO: consider adding a ModelPT guard to check if model is being restored. # allowing restored models to optionally setup datasets self.build_train_valid_test_datasets() self.setup_training_data(self.cfg.data) self.setup_validation_data(self.cfg.data) self.setup_test_data(self.cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: self.model.sync_initial_word_embeddings() def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) logging.info( f'Setting up train dataloader with len(len(self._train_ds)): {len(self._train_ds)} and consumed samples: {consumed_samples}' ) self._train_dl = self.build_pretraining_data_loader( self._train_ds, consumed_samples) def setup_validation_data(self, cfg): if hasattr(self, '_validation_ds'): consumed_samples = 0 logging.info( f'Setting up validation dataloader with len(len(self._validation_ds)): {len(self._validation_ds)} and consumed samples: {consumed_samples}' ) self._validation_dl = self.build_pretraining_data_loader( self._validation_ds, consumed_samples) def setup_test_data(self, cfg): if hasattr(self, '_test_ds'): consumed_samples = 0 logging.info( f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' ) self._test_dl = self.build_pretraining_data_loader( self._test_ds, consumed_samples) def configure_optimizers(self): self.setup_optimization() # Wrap the baseline optimizer with the optimizer class with master parameters if self.megatron_amp_o2 and self._optimizer is not None: if self.cfg.precision == 'bf16': fp32_grad_accum = True contiguous_grad_bucket = True elif self.cfg.precision == 16: fp32_grad_accum = False # TODO: contiguous grad bucket for fp16 is also planned to be supported contiguous_grad_bucket = False raise ValueError( "fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config." ) # TODO: this should be true when not using pipeline parallelism # we will support that for bf16 when we have async handler from apex # and we will support it for fp16 when we have it implemented in the O2 recipe async_grad_allreduce = False self._optimizer = MainParamsOptimizerWrapper( self._optimizer, fp32_grad_accum=fp32_grad_accum, contiguous_grad_bucket=contiguous_grad_bucket, async_grad_allreduce=async_grad_allreduce, ) assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config." 
    def configure_optimizers(self):
        self.setup_optimization()

        # Wrap the baseline optimizer with the optimizer class with master parameters
        if self.megatron_amp_o2 and self._optimizer is not None:
            if self.cfg.precision == 'bf16':
                fp32_grad_accum = True
                contiguous_grad_bucket = True

            elif self.cfg.precision == 16:
                fp32_grad_accum = False
                # TODO: contiguous grad bucket for fp16 is also planned to be supported
                contiguous_grad_bucket = False
                raise ValueError(
                    "fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config."
                )

            # TODO: this should be true when not using pipeline parallelism
            # we will support that for bf16 when we have async handler from apex
            # and we will support it for fp16 when we have it implemented in the O2 recipe
            async_grad_allreduce = False

            self._optimizer = MainParamsOptimizerWrapper(
                self._optimizer,
                fp32_grad_accum=fp32_grad_accum,
                contiguous_grad_bucket=contiguous_grad_bucket,
                async_grad_allreduce=async_grad_allreduce,
            )

        assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config."
        sched_config = self._cfg.optim.sched
        sched_config['max_steps'] = self._trainer.max_steps
        self._scheduler = prepare_lr_scheduler(
            optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl
        )

        if self._scheduler is None:
            return self._optimizer
        else:
            return [self._optimizer], [self._scheduler]

    def compute_consumed_samples(self, steps_since_resume=0):
        app_state = AppState()
        consumed_samples = (
            self.init_consumed_samples
            + steps_since_resume * app_state.data_parallel_size * self.cfg.micro_batch_size * get_num_microbatches()
        )
        return int(consumed_samples)

    def configure_gradient_clipping(self, *args, **kwargs):
        """PTL hook to configure gradients.
           We use the gradient clipping implementation from megatron-lm.
        """
        clip_val = self.trainer.gradient_clip_val
        if clip_val is None:
            return

        clip_val = float(clip_val)
        if clip_val <= 0:
            return

        if self.megatron_amp_o2:
            # grab the fp32 master parameters for gradient clipping
            parameters = self._optimizer.get_parameters()
        else:
            parameters = self.get_parameters()

        grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val)
        self.log('grad_norm', grad_norm, rank_zero_only=True)

    def get_parameters(self):
        params = []
        for param_group in self._optimizer_param_groups:
            for param in param_group['params']:
                params.append(param)
        return params

    def generate(
        self,
        inputs: Union[List[str], torch.Tensor, List[dict]],
        length_params: LengthParam,
        sampling_params: SamplingParam = None,
    ) -> OutputType:

        # check whether the DDP is initialized
        if parallel_state.is_unitialized():

            def dummy():
                return

            if self.trainer.strategy.launcher is not None:
                self.trainer.strategy.launcher.launch(dummy, trainer=self.trainer)
            self.trainer.strategy.setup_environment()

        # set the default sampling params if it is None (defaults to greedy sampling)
        if sampling_params is None:
            sampling_params = get_default_sampling_params()

        # set the default length params if it is None
        if length_params is None:
            length_params = get_default_length_params()

        return megatron_gpt_generate(self.cuda(), inputs, self.tokenizer, length_params, sampling_params)

    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any:
        inference_config = self.get_inference_config()
        if inference_config is None:
            return None
        else:
            # need to overwrite some configuration, so work on a copy of the inference config
            inference_config = inference_config.copy()
            compute_logprob = inference_config['compute_logprob']
            if compute_logprob:
                del inference_config['compute_logprob']
                inference_config['inputs'] = batch
                inference_config['tokens_to_generate'] = 1
                inference_config['all_probs'] = True
                inference_config["add_BOS"] = False
                inference_config['greedy'] = True
                response = generate(self, **inference_config)
                compute_prob_response = get_computeprob_response(self.tokenizer, response, batch)
                return compute_prob_response
            else:
                del inference_config['compute_logprob']
                inference_config['inputs'] = batch
                return generate(self, **inference_config)

    def list_available_models(self):
        return None
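
    # NOTE (illustrative sketch, not part of the original file): a typical call into generate()
    # above passes plain dicts for the length and sampling parameters. The exact keys are defined
    # by the LengthParam/SamplingParam typed dicts; the field names and values below are
    # assumptions for illustration only.
    #
    #   length_params = {"max_length": 64, "min_length": 1}
    #   sampling_params = {"use_greedy": False, "temperature": 0.8, "top_k": 0, "top_p": 0.9,
    #                      "repetition_penalty": 1.2, "add_BOS": True}
    #   output = model.generate(["Deep learning is"], length_params, sampling_params)
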
    def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by, tensor_model_parallel_size):
        """Pad the vocab size so it is divisible by the model parallel size and still has a GPU-friendly size."""

        after = orig_vocab_size
        multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
        while (after % multiple) != 0:
            after += 1
        logging.info(
            f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.'
        )
        return after

    def _enable_nvidia_optimizations(self):
        """These optimizations are present in NVIDIA NGC PyTorch Containers."""

        # Version check
        nvidia_torch_version = os.getenv('NVIDIA_PYTORCH_VERSION', None)
        if nvidia_torch_version is not None:
            NVIDIA_TORCH_MAJOR = int(nvidia_torch_version.split('.')[0])
            NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1])

            # Apex persistent layer norm is supported from NVIDIA PyTorch container v21.11
            if NVIDIA_TORCH_MAJOR < 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR < 11):
                self.cfg.persist_layer_norm = False

            if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11):
                # NVFUSER
                torch._C._jit_set_profiling_executor(True)
                torch._C._jit_set_profiling_mode(True)
                torch._C._jit_override_can_fuse_on_cpu(False)
                torch._C._jit_override_can_fuse_on_gpu(False)
                torch._C._jit_set_texpr_fuser_enabled(False)
                torch._C._jit_set_nvfuser_enabled(True)
                torch._C._debug_set_autodiff_subgraph_inlining(False)
        else:
            # Not an NVIDIA container. Dependency check is on users.
            pass

    def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any:
        """ PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device
            When using pipeline parallelism, we need the global batch to remain on the CPU,
            since the memory overhead would be too high when using a large number of microbatches.
            Microbatches are transferred from CPU to GPU inside the pipeline.
        """
        return batch

    def _validate_trainer(self):
        """ Certain trainer configurations can break training.
            Here we try to catch them and raise an error.
        """
        if self.trainer.accumulate_grad_batches > 1:
            raise ValueError(
                'Gradient accumulation is done within training_step. trainer.accumulate_grad_batches must equal 1'
            )

    @classmethod
    def list_available_models(cls) -> Optional[PretrainedModelInfo]:
        """
        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        """
        result = []
        result.append(
            PretrainedModelInfo(
                pretrained_model_name="megatron_gpt_345m",
                location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/megatron_gpt_345m/versions/1/files/megatron_gpt_345m.nemo",
                description="345M parameter GPT generative Megatron model.",
            )
        )
        return result
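
# NOTE (illustrative sketch, not part of the original file): the padding performed by
# _vocab_size_with_padding() above is equivalent to rounding the vocabulary size up to the next
# multiple of make_vocab_size_divisible_by * tensor_model_parallel_size. The helper below is a
# hypothetical standalone version for illustration only; it is not used by the classes in this file.
def _example_padded_vocab_size(orig_vocab_size: int, divisible_by: int = 128, tp_size: int = 1) -> int:
    # round up to the next multiple of (divisible_by * tp_size)
    multiple = divisible_by * tp_size
    return ((orig_vocab_size + multiple - 1) // multiple) * multiple


# Example: a 50257-token vocabulary with divisible_by=128 and tensor parallel size 2 pads to 50432,
# i.e. _example_padded_vocab_size(50257, 128, 2) -> 50432 (175 dummy tokens).
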
class MegatronLMEncoderDecoderModel(MegatronBaseModel):
    """
    Megatron encoder-decoder base class
    """

    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)

        # build tokenizer (defaults to nemo supported tokenizers)
        self._build_tokenizer()

        # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
        self._build_vocab()

        # TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)
        self.enc_dec_model = MegatronTokenLevelEncoderDecoderModule(
            encoder_arch=cfg.encoder_arch,
            decoder_arch=cfg.decoder_arch,
            vocab_size=self.padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=0,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            attention_dropout=cfg.get('attention_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection', False),
            activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            persist_layer_norm=cfg.get('persist_layer_norm', False),
            bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
            masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
            onnx_safe=cfg.get('onnx_safe', False),
            activation=cfg.get('activation', 'gelu'),
        )

        self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:
            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.enc_dec_model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

    def _build_tokenizer(self):
        """
        Default tokenizer is based on available nemo tokenizers.
        Override this method to use an external tokenizer.
        All tokenizers are expected to provide a compatible interface.
        Overrides the default encoder-decoder tokenizer to use legacy=True for sentencepiece.
        """
        self.tokenizer = get_nmt_tokenizer(
            library=self._cfg.tokenizer.library,
            model_name=self._cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer_model", self._cfg.tokenizer.model),
            vocab_file=self.register_artifact("vocab_file", self._cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("merges_file", self._cfg.tokenizer.merge_file),
            legacy=True if self._cfg.tokenizer.library == 'sentencepiece' else False,
        )
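
    # NOTE (illustrative sketch, not part of the original file): _build_tokenizer() above expects a
    # tokenizer block in the model config with the fields it reads (library, type, model,
    # vocab_file, merge_file). A hypothetical YAML fragment could look like:
    #
    #   tokenizer:
    #     library: 'megatron'
    #     type: 'GPT2BPETokenizer'
    #     model: null
    #     vocab_file: gpt2-vocab.json
    #     merge_file: gpt2-merges.txt
    #
    # When library is 'sentencepiece', legacy=True is passed so special tokens go through the
    # legacy SentencePiece code path.
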
    def _build_vocab(self):
        """
        Manipulate vocabulary (e.g., pad vocabulary for increased performance).
        """
        # TODO: add config to allow to disable it?
        self.padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=self.tokenizer.vocab_size,
            make_vocab_size_divisible_by=self._cfg.get('make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=self._cfg.get('tensor_model_parallel_size', 1),
        )

    def forward(
        self,
        encoder_input_ids,
        decoder_input_ids,
        encoder_attn_mask,
        decoder_attn_mask,
        tokentype_ids=None,
        lm_labels=None,
        enc_hidden_states=None,
        enc_output_mask=None,
        output_enc_hidden_only=False,
        enc_input=None,
    ):
        ret_dict = self.enc_dec_model(
            enc_input_ids=encoder_input_ids,
            dec_input_ids=decoder_input_ids,
            enc_attn_mask=encoder_attn_mask,
            dec_attn_mask=decoder_attn_mask,
            tokentype_ids=tokentype_ids,
            labels=lm_labels,
            enc_hidden_states=enc_hidden_states,
            enc_output_mask=enc_output_mask,
            output_enc_hidden_only=output_enc_hidden_only,
            enc_input=enc_input,
        )

        return ret_dict

    def setup_optimizer_param_groups(self):
        """ModelPT override. The optimizer will receive self._optimizer_param_groups."""
        self._optimizer_param_groups = _get_params_for_weight_decay_optimization([self.enc_dec_model])

    def configure_optimizers(self):
        self.setup_optimization()

        # Wrap the baseline optimizer with the optimizer class with master parameters
        if self.megatron_amp_o2 and self._optimizer is not None:
            if self.cfg.precision == 'bf16':
                fp32_grad_accum = True
                contiguous_grad_bucket = True
                async_grad_allreduce = True

            elif self.cfg.precision == 16:
                fp32_grad_accum = False
                # TODO: contiguous grad bucket for fp16 is also planned to be supported
                contiguous_grad_bucket = False
                async_grad_allreduce = False

            self._optimizer = MainParamsOptimizerWrapper(
                self._optimizer,
                fp32_grad_accum=fp32_grad_accum,
                contiguous_grad_bucket=contiguous_grad_bucket,
                async_grad_allreduce=async_grad_allreduce,
            )

        assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config."
        if hasattr(self._cfg.optim, 'sched'):
            sched_config = self._cfg.optim.sched
            sched_config['max_steps'] = self._trainer.max_steps
            self._scheduler = prepare_lr_scheduler(
                optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl
            )

        if self._scheduler is None:
            return self._optimizer
        else:
            return [self._optimizer], [self._scheduler]

    def get_parameters(self):
        params = []
        for param_group in self._optimizer_param_groups:
            for param in param_group['params']:
                params.append(param)
        return params
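
    # NOTE (illustrative sketch, not part of the original file): training_step()/loss_func() below
    # compute a masked token-level negative log-likelihood: per-token losses are multiplied by a
    # 0/1 loss mask and averaged only over unmasked positions. For example, with per-token losses
    # [2.0, 1.0, 4.0] and mask [1, 1, 0]:
    #
    #   loss = (2.0*1 + 1.0*1 + 4.0*0) / (1 + 1 + 0) = 1.5
    #
    # so padded positions contribute nothing to either the numerator or the denominator.
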
    def training_step(self, batch, batch_idx):
        tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask = self.process_batch(batch)

        tokens_loss = itemgetter("tokens_loss")(
            self(tokens_enc, tokens_dec, enc_mask, dec_mask, tokentype_ids=None, lm_labels=labels,)
        )

        loss = self.loss_func(loss_mask, tokens_loss)
        self.log('train_loss', loss)
        # Reduced loss for logging. This averages the loss across all workers, unlike "loss" above which is specific to a DDP rank.
        reduced_loss = average_losses_across_data_parallel_group([loss])
        # cache reduced loss while accumulating gradients
        self._reduced_loss_buffer.append(reduced_loss[0])

        if (batch_idx + 1) % self.trainer.accumulate_grad_batches == 0:
            # Reduced loss for logging.
            average_reduced_loss = sum(self._reduced_loss_buffer) / len(self._reduced_loss_buffer)
            self.log('reduced_train_loss', average_reduced_loss, prog_bar=True)
            lr = self._optimizer.param_groups[0]['lr']
            self.log('lr', lr)
            self.log('global_step', self.trainer.global_step, prog_bar=True)
            self.log(
                'consumed_samples',
                self.compute_consumed_samples(self.trainer.global_step - self.init_global_step),
                prog_bar=True,
            )
            self._reduced_loss_buffer = []

        return loss

    def validation_step(self, batch, batch_idx):
        tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask = self.process_batch(batch)

        tokens_loss = itemgetter("tokens_loss")(
            self(tokens_enc, tokens_dec, enc_mask, dec_mask, tokentype_ids=None, lm_labels=labels,)
        )
        loss = self.loss_func(loss_mask, tokens_loss)
        reduced_loss = average_losses_across_data_parallel_group([loss])
        return reduced_loss

    def validation_epoch_end(self, outputs):
        averaged_loss = average_losses_across_data_parallel_group(outputs)
        self.log('val_loss', averaged_loss[0], prog_bar=True)
        self.log('consumed_samples', self.compute_consumed_samples(self.trainer.global_step - self.init_global_step))

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def test_epoch_end(self, outputs):
        averaged_loss = average_losses_across_data_parallel_group(outputs)
        logging.info(f'test_loss: {averaged_loss[0]}')

    def loss_func(self, loss_mask, tokens_loss):
        """
        Takes per-token loss as input and masks out values that should not contribute.
        """
        losses = tokens_loss.view(-1).float()
        loss_mask = loss_mask.view(-1).float()
        # TODO: add nemo version here
        loss = torch.sum(losses * loss_mask) / loss_mask.sum()  # sequence level nll
        return loss

    def process_batch(self, batch):
        """Build the batch."""
        keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', 'enc_mask', 'dec_mask']
        datatype = torch.int64

        data = batch
        data_b = tensor_parallel.broadcast_data(keys, data, datatype)

        # Unpack.
        tokens_enc = data_b['text_enc'].long()
        tokens_dec = data_b['text_dec'].long()
        labels = data_b['labels'].long()
        loss_mask = data_b['loss_mask'].float()

        enc_mask = data_b['enc_mask']
        dec_mask = data_b['dec_mask']

        return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask

    def build_train_valid_test_datasets(self):
        raise NotImplementedError("Please implement this method in child-class")

    def build_pretraining_data_loader(self, dataset, consumed_samples):
        """Build dataloader given an input dataset."""

        if dataset is None:
            return None

        # Megatron sampler
        if self._cfg.data.dataloader_type == 'single':
            batch_sampler = MegatronPretrainingSampler(
                total_samples=len(dataset),
                consumed_samples=consumed_samples,
                micro_batch_size=self._cfg.micro_batch_size,
                data_parallel_rank=parallel_state.get_data_parallel_rank(),
                data_parallel_size=parallel_state.get_data_parallel_world_size(),
            )
        elif self._cfg.data.dataloader_type == 'cyclic':
            batch_sampler = MegatronPretrainingRandomSampler(
                total_samples=len(dataset),
                consumed_samples=consumed_samples,
                micro_batch_size=self._cfg.micro_batch_size,
                data_parallel_rank=parallel_state.get_data_parallel_rank(),
                data_parallel_size=parallel_state.get_data_parallel_world_size(),
            )
        else:
            raise Exception('{} dataloader type is not supported.'.format(self._cfg.data.dataloader_type))

        # Torch dataloader.
        return torch.utils.data.DataLoader(
            dataset, batch_sampler=batch_sampler, num_workers=self._cfg.data.num_workers, pin_memory=True,
        )

    def setup(self, stage=None):
        """A PTL method to setup the training, validation and test datasets."""
        resume_checkpoint_path = self.trainer.checkpoint_connector.resume_from_checkpoint_fit_path
        if resume_checkpoint_path:
            try:
                init_consumed_samples = int(
                    float(re.findall(r"consumed_samples\=([0-9]+\.[0-9]+)", resume_checkpoint_path)[0])
                )
            except (ValueError, TypeError):
                logging.warning("Cannot parse the checkpoint file to get the consumed samples. Assuming it is zero.")
                init_consumed_samples = 0
        else:
            init_consumed_samples = 0
        self.init_consumed_samples = init_consumed_samples

        if stage == 'predict':
            return
        if self._train_dl is not None and self._validation_dl is not None:
            return
        self.build_train_valid_test_datasets()
        self.setup_training_data(self._cfg.data)
        self.setup_validation_data(self._cfg.data)
        self.setup_test_data(self._cfg.data)

    def on_pretrain_routine_start(self) -> None:
        # keep a copy of init_global_step
        self.init_global_step = self.trainer.global_step
        return super().on_pretrain_routine_start()

    def setup_training_data(self, cfg):
        if hasattr(self, '_train_ds'):
            consumed_samples = self.compute_consumed_samples(0)
            self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples)

    def setup_validation_data(self, cfg):
        if hasattr(self, '_validation_ds'):
            consumed_samples = 0
            self._validation_dl = self.build_pretraining_data_loader(self._validation_ds, consumed_samples)

    def setup_test_data(self, cfg):
        if hasattr(self, '_test_ds'):
            consumed_samples = 0
            self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples)

    def compute_consumed_samples(self, steps_since_resume=0):
        app_state = AppState()
        consumed_samples = (
            self.init_consumed_samples
            + steps_since_resume
            * app_state.data_parallel_size
            * self._cfg.micro_batch_size
            * self.trainer.accumulate_grad_batches
        )
        return int(consumed_samples)

    def configure_gradient_clipping(self, *args, **kwargs):
        """PTL hook to configure gradients.
           We use the gradient clipping implementation from megatron-lm.
        """
        clip_val = self.trainer.gradient_clip_val
        if clip_val is None:
            return

        clip_val = float(clip_val)
        if clip_val <= 0:
            return

        if self.megatron_amp_o2:
            # grab the fp32 master parameters for gradient clipping
            parameters = self._optimizer.get_parameters()
        else:
            parameters = self.get_parameters()

        grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val)
        self.log('grad_norm', grad_norm, rank_zero_only=True)

    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any:
        request = batch
        response = self.complete(request)
        logging.info(f"response: {response}")
        return response
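
    # NOTE (illustrative sketch, not part of the original file): compute_consumed_samples() above is
    # plain bookkeeping. With hypothetical values init_consumed_samples=0, steps_since_resume=10,
    # data_parallel_size=4, micro_batch_size=8 and accumulate_grad_batches=2:
    #
    #   consumed_samples = 0 + 10 * 4 * 8 * 2 = 640
    #
    # i.e. every optimizer step advances the dataset by one global batch worth of samples, which is
    # also the offset handed back to the Megatron samplers on resume.
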
    def decode(self, tokens_enc, enc_mask, num_tokens_to_generate, enc_input=None):
        # TODO: move method into a class inside MegatronTokenLevelEncoderDecoderModule (?)
        encoder_hidden_states, enc_output_mask = itemgetter("enc_output", "enc_output_mask")(
            self(
                encoder_input_ids=tokens_enc,
                decoder_input_ids=None,
                encoder_attn_mask=enc_mask,
                decoder_attn_mask=None,
                tokentype_ids=None,
                lm_labels=None,
                enc_hidden_states=None,
                enc_output_mask=None,
                output_enc_hidden_only=True,
                enc_input=enc_input,
            )
        )
        predicted_tokens_dec = (
            torch.LongTensor([self.tokenizer.bos_id] * tokens_enc.size(0)).unsqueeze(1).to(tokens_enc.device)
        )
        for _ in range(num_tokens_to_generate):
            dec_mask = predicted_tokens_dec != self.tokenizer.pad_id
            token_logits = itemgetter("token_logits")(
                self(
                    encoder_input_ids=tokens_enc,
                    decoder_input_ids=predicted_tokens_dec,
                    encoder_attn_mask=enc_mask,
                    decoder_attn_mask=dec_mask,
                    tokentype_ids=None,
                    lm_labels=None,
                    enc_hidden_states=encoder_hidden_states,
                    enc_output_mask=enc_output_mask,
                    output_enc_hidden_only=False,
                    enc_input=enc_input,
                )
            )
            token_logits = tensor_parallel.gather_from_tensor_model_parallel_region(token_logits)
            log_probs, token_ids = torch.max(nn.functional.log_softmax(token_logits, dim=-1), dim=-1)
            predicted_tokens_dec = torch.cat([predicted_tokens_dec, token_ids[:, -1].unsqueeze(1)], 1)

        return predicted_tokens_dec, log_probs

    def complete(self, request: Dict):
        """
        Autoregressively invokes the language model in inference mode.

        Args:
            request: Dictionary with the following fields
                * prompt: a string which text the model should complete.
                * tokens_to_generate: how many tokens to generate while doing prompt completion.

        Returns:
            response: A python dictionary with the following fields
                * prompt: original text of the prompt
                * tokenized_prompt: list of (str) tokens from prompt
                * completion: a python dictionary with the following subfields:
                    * tokens: a list of triples (token_id, token, log_prob) comprising completion
                    * text: completion text (as a single string)
        """
        response = {}
        self.freeze()
        # naive greedy slow loop
        # TODO: add option for BeamSearchDecoder

        response['prompt'] = request['prompt'][0]
        response['completion'] = {}

        tokens_enc = request['masked_sample']

        response['masked_input'] = ' '.join(self.tokenizer.ids_to_tokens(tokens_enc[0]))
        enc_mask = tokens_enc != self.tokenizer.pad_id

        predicted_tokens_ids, log_probs = self.decode(tokens_enc, enc_mask, int(request['tokens_to_generate']))
        predicted_tokens_ids = predicted_tokens_ids.cpu().numpy()[0].tolist()
        log_probs = log_probs.cpu().numpy()[0].tolist()

        # truncate at the first end-of-sequence token, otherwise just strip padding
        if self.tokenizer.eos_id in predicted_tokens_ids:
            idx = predicted_tokens_ids.index(self.tokenizer.eos_id)
            predicted_tokens_ids = predicted_tokens_ids[:idx]
        else:
            predicted_tokens_ids = [token_id for token_id in predicted_tokens_ids if token_id != self.tokenizer.pad_id]
        predicted_tokens_dec = self.tokenizer.ids_to_tokens(predicted_tokens_ids)
        response['completion']['text'] = self.tokenizer.tokens_to_text(predicted_tokens_dec)
        response['completion']['tokens'] = list(zip(predicted_tokens_ids, predicted_tokens_dec, log_probs))
        self.unfreeze()
        return response

    def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by, tensor_model_parallel_size):
        """Pad the vocab size so it is divisible by the model parallel size and still has a GPU-friendly size."""

        after = orig_vocab_size
        multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
        while (after % multiple) != 0:
            after += 1
        logging.info(
            f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.'
        )
        return after

    def _enable_nvidia_optimizations(self):
        """These optimizations are present in NVIDIA NGC PyTorch Containers."""

        # Version check
        nvidia_torch_version = os.getenv('NVIDIA_PYTORCH_VERSION', None)
        if nvidia_torch_version is not None:
            NVIDIA_TORCH_MAJOR = int(nvidia_torch_version.split('.')[0])
            NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1])

            # Apex persistent layer norm is supported from NVIDIA PyTorch container v21.11
            if NVIDIA_TORCH_MAJOR < 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR < 11):
                self._cfg.persist_layer_norm = False

            if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11):
                # NVFUSER
                torch._C._jit_set_profiling_executor(True)
                torch._C._jit_set_profiling_mode(True)
                torch._C._jit_override_can_fuse_on_cpu(False)
                torch._C._jit_override_can_fuse_on_gpu(False)
                torch._C._jit_set_texpr_fuser_enabled(False)
                torch._C._jit_set_nvfuser_enabled(True)
                torch._C._debug_set_autodiff_subgraph_inlining(False)
        else:
            # Not an NVIDIA container. Dependency check is on users.
            pass

    def list_available_models(self):
        pass
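
# NOTE (illustrative sketch, not part of the original file): the container gate used by
# _enable_nvidia_optimizations() above is essentially a (major, minor) comparison of the
# NVIDIA_PYTORCH_VERSION string against 21.11. The hypothetical helper below shows the same
# comparison in isolation; it is not used by the classes in this file.
def _example_container_is_at_least_21_11(version_str: str) -> bool:
    # "21.11" -> (21, 11); anything below that disables Apex persistent layer norm
    major, minor = (int(part) for part in version_str.split('.')[:2])
    return (major, minor) >= (21, 11)


# _example_container_is_at_least_21_11("21.10") -> False
# _example_container_is_at_least_21_11("21.11") -> True
# _example_container_is_at_least_21_11("22.01") -> True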