def param_hook(*unused): """Gradient accumulation and all-reduce.""" if param.grad.data is None: return if main_param.grad is None: main_param.grad = param.grad.float() else: main_param.grad.add_(param.grad.data) # Deallocate grad memory. param.grad = None # Asynchronous gradients allreduce across data_parallel ranks if self._require_backward_grad_sync: if self._grad_allreduce_chunk_size_mb > 0: self._main_grad_buffers[i].update_chunk_info( grad_chunk_info) while True: allreduce_tensor = self._main_grad_buffers[ i].get_allreduce_tensor() if allreduce_tensor is None: break allreduce_tensor.div_(get_data_parallel_world_size()) torch.distributed.all_reduce( allreduce_tensor, group=get_data_parallel_group(), async_op=True) else: main_param.grad.div_(get_data_parallel_world_size()) torch.distributed.all_reduce( # type: ignore main_param.grad, group=get_data_parallel_group(), async_op=True)
def average_losses_across_data_parallel_group(losses):
    """Reduce a tensor of losses across all GPUs."""
    averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses])
    torch.distributed.all_reduce(averaged_losses, group=parallel_state.get_data_parallel_group())
    averaged_losses = averaged_losses / torch.distributed.get_world_size(
        group=parallel_state.get_data_parallel_group()
    )
    return averaged_losses
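# Hedged usage sketch for the helper above: assumes torch.distributed and
# Megatron's parallel_state groups are already initialized, and that the two
# loss tensors are hypothetical per-step losses computed earlier on this rank.
import torch

loss_mlm = torch.tensor(2.31, device='cuda')  # placeholder loss values
loss_sop = torch.tensor(0.47, device='cuda')
averaged = average_losses_across_data_parallel_group([loss_mlm, loss_sop])
# averaged[0] and averaged[1] now hold the data-parallel means of the two losses.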
def init_model_parallel(self, global_rank: int, world_size: int) -> None:
    """ Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_gpus
        is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
    """
    app_state = AppState()

    # we initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        if torch.distributed.is_initialized():
            parallel_state.initialize_model_parallel(app_state.model_parallel_size)
            app_state.model_parallel_group = parallel_state.get_tensor_model_parallel_group()
            app_state.data_parallel_group = parallel_state.get_data_parallel_group()
            app_state.model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()
            app_state.data_parallel_rank = parallel_state.get_data_parallel_rank()
            app_state.data_parallel_size = parallel_state.get_data_parallel_world_size()
            logging.info(f'mp_rank: {app_state.model_parallel_rank}')
            logging.info(f'dp_rank: {app_state.data_parallel_rank}')
def allreduce_gradients(self):
    """Reduce gradients across data parallel ranks.
    Modified from megatron-lm:
    https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
    """
    # Bucketize and all-reduce
    buckets = {}
    for param in self.parameters():
        if param.requires_grad and param.grad is not None:
            tp = param.data.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(param)
            # param.main_grad = param.grad

    # For each bucket, all-reduce and copy all-reduced grads.
    for tp in buckets:
        bucket = buckets[tp]
        grads = [param.grad.data for param in bucket]
        coalesced = torch._utils._flatten_dense_tensors(grads)
        coalesced /= parallel_state.get_data_parallel_world_size()
        torch.distributed.all_reduce(coalesced, group=parallel_state.get_data_parallel_group())
        for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)):
            buf.copy_(synced)
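# Hedged usage sketch: one way a manual all-reduce like allreduce_gradients()
# is typically driven when DDP's automatic gradient sync is disabled (e.g. via
# a no-op communication hook). `model`, `batch`, and `optimizer` are
# hypothetical; only the call order is the point here.
loss = model(batch)           # forward pass producing a scalar loss
loss.backward()               # accumulate local gradients on this rank
model.allreduce_gradients()   # average gradients across data-parallel ranks
optimizer.step()
optimizer.zero_grad()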
def configure_ddp(self):
    """ Override LightningModule ddp if using model parallel.
        Sets find_unused_parameters to False to use activation-checkpoint-recomputation.
    """
    app_state = AppState()

    if app_state.model_parallel_size is not None:
        logging.info("Configuring DDP for model parallelism.")

        # With model parallelism, multiple GPUs form a large "logical GPU";
        # this means that data parallel groups span multiple GPUs
        # and are non-trivial.
        # TODO: for megatron-lm self.model is a list
        self.pre_configure_ddp()
        # device_ids = self.determine_ddp_device_ids()
        self._model = DistributedDataParallel(
            LightningDistributedModule(self.model),
            process_group=parallel_state.get_data_parallel_group(),
            **self._ddp_kwargs,
        )

        if self.no_ddp_communication_hook:
            # When using custom gradient accumulation and allreduce, disable
            # DDP communication hook that works on the gradient bucket.
            # Instead, use the custom gradient function and communication hook,
            # which is defined in the master optimizer wrapper.
            self._model.require_backward_grad_sync = False
            self._model.register_comm_hook(None, noop_hook)
    else:
        super().configure_ddp()
def param_hook(*unused):
    # Accumulates gradients on main gradients
    if param.grad.data is not None:
        if main_param.grad is None:
            main_param.grad = param.grad.float()
        else:
            main_param.grad.add_(param.grad.data)
        # Deallocate grad memory.
        param.grad = None

    # Asynchronous gradients allreduce across data_parallel ranks
    if self._require_backward_grad_sync:
        main_param.grad.div_(get_data_parallel_world_size())
        torch.distributed.all_reduce(
            main_param.grad, group=get_data_parallel_group(), async_op=True)
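# Hedged sketch of how per-parameter hooks of this shape are commonly attached
# in Megatron-style mixed-precision optimizers: the hook is registered on the
# parameter's gradient-accumulation node so it fires once the grad for that
# parameter is ready in each backward pass. `self._fp16_params`,
# `self._fp32_main_params`, and `_make_param_hook` are hypothetical names.
for param, main_param in zip(self._fp16_params, self._fp32_main_params):
    if param.requires_grad:
        # Expanding the parameter yields a view whose grad_fn's first
        # next_function is the AccumulateGrad node for `param`.
        param_tmp = param.expand_as(param)
        grad_acc = param_tmp.grad_fn.next_functions[0][0]
        grad_acc.register_hook(self._make_param_hook(param, main_param))
        # Keep a reference so the hook is not garbage-collected.
        self._grad_accs.append(grad_acc)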
def init_model_parallel(self, global_rank: int, world_size: int) -> None:
    """ Initializes Megatron-LM model parallel if using model parallelism.

    Args:
        global_rank (int): the global process index.
        world_size (int): the total number of GPUs, num_nodes * num_devices
        is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
    """
    app_state = AppState()

    # we initialize megatron-lm model parallel and data parallel groups
    # after initializing DDP with PTL.
    if app_state.model_parallel_size is not None:
        # destroy groups in case they have already been created
        # this happens with multiple calls to trainer.test for example
        parallel_state.destroy_model_parallel()
        if torch.distributed.is_initialized():
            parallel_state.initialize_model_parallel(
                tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
                pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
                pipeline_model_parallel_split_rank_=app_state.pipeline_model_parallel_split_rank,
            )

            # assert that fake tp and pp rank match after model parallel init
            assert app_state.tensor_model_parallel_rank == parallel_state.get_tensor_model_parallel_rank()
            assert app_state.pipeline_model_parallel_rank == parallel_state.get_pipeline_model_parallel_rank()

            app_state.tensor_model_parallel_group = parallel_state.get_tensor_model_parallel_group()
            app_state.data_parallel_group = parallel_state.get_data_parallel_group()
            app_state.data_parallel_rank = parallel_state.get_data_parallel_rank()
            app_state.data_parallel_size = parallel_state.get_data_parallel_world_size()
            app_state.pipeline_model_parallel_group = parallel_state.get_pipeline_model_parallel_group()
def test_initialize_model_parallel(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            tensor_model_parallel_size))
    tensor_model_parallel_size_ = min(
        tensor_model_parallel_size,
        torch.distributed.get_world_size(),
    )
    assert not parallel_state.model_parallel_is_initialized()
    parallel_state.initialize_model_parallel(tensor_model_parallel_size_)
    assert parallel_state.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = tensor_model_parallel_size_
    rank = torch.distributed.get_rank() % tensor_model_parallel_size_
    assert world_size == parallel_state.get_tensor_model_parallel_world_size()
    assert rank == parallel_state.get_tensor_model_parallel_rank()
    check(parallel_state.get_tensor_model_parallel_group(), world_size, rank)

    # Data parallel.
    world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
    rank = torch.distributed.get_rank() // tensor_model_parallel_size_
    assert world_size == parallel_state.get_data_parallel_world_size()
    assert rank == parallel_state.get_data_parallel_rank()
    check(parallel_state.get_data_parallel_group(), world_size, rank)

    # Reset groups
    parallel_state.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(TEST_SUCCESS_MESSAGE)
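# Hedged launcher sketch for the test above: one process per GPU, started with
# `torchrun --nproc_per_node=<num_gpus> this_file.py`. The generic NCCL setup
# below is an assumption, not the project's own test harness; TEST_SUCCESS_MESSAGE
# and the `apex.transformer` import path for parallel_state are plausible choices,
# not confirmed by the snippet itself.
import torch
from apex.transformer import parallel_state

TEST_SUCCESS_MESSAGE = '>> passed the test :-)'

if __name__ == '__main__':
    torch.distributed.init_process_group(backend='nccl')
    torch.cuda.set_device(torch.distributed.get_rank() % torch.cuda.device_count())
    test_initialize_model_parallel(tensor_model_parallel_size=2)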
def _build_index_mappings(name, data_prefix, documents, sizes, num_samples, seq_length, seed, index_mapping_dir: str = None):
    """Build doc-idx, sample-idx, and shuffle-idx.
    doc-idx: is an array (ordered) of documents to be used in training.
    sample-idx: is the start document index and document offset for each training sample.
    shuffle-idx: maps the sample index into a random index into sample-idx.
    """
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    if index_mapping_dir is not None:
        _filename = os.path.join(index_mapping_dir, os.path.basename(data_prefix))
    else:
        _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0:
        if ((not os.path.isfile(doc_idx_filename))
                or (not os.path.isfile(sample_idx_filename))
                or (not os.path.isfile(shuffle_idx_filename))):

            logging.info(' > WARNING: could not find index map files, building '
                         'the indices on rank 0 ...')

            # For the last epoch, decide whether include the entire epoch
            # in the global shuffle or not.

            # If we need only one epoch, then separating last epoch does
            # not mean anything.
            if num_epochs == 1:
                separate_last_epoch = False
                print(' > only one epoch required, setting '
                      'separate_last_epoch to False', flush=True)
            else:
                # Get the number of samples for the last epoch
                num_samples_from_epochs_minus_one = (
                    (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
                last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
                assert last_epoch_num_samples >= 0, 'last epoch number of samples should be non-negative.'
                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
                assert last_epoch_num_samples < (num_samples_per_epoch + 1), \
                    'last epoch number of samples exceeded max value.'
                # If we have less than 80% of the samples for the last epoch,
                # separate out the epoch and treat it differently.
                # Note: the 80% number is just based on common sense and can
                # be adjusted if needed.
                separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch)
                if separate_last_epoch:
                    string = (' > last epoch number of samples ({}) is smaller '
                              'than 80% of number of samples per epoch ({}), '
                              'setting separate_last_epoch to True')
                else:
                    string = (' > last epoch number of samples ({}) is larger '
                              'than 80% of number of samples per epoch ({}), '
                              'setting separate_last_epoch to False')
                print(string.format(last_epoch_num_samples, num_samples_per_epoch), flush=True)

            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            logging.info(' > elapsed time to build and save doc-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))

            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            try:
                from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

                compile_helper()
                from nemo.collections.nlp.data.language_modeling.megatron import helpers
            except ImportError:
                raise ImportError(
                    'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
                )
            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch)
            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                                num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            logging.info(' > elapsed time to build and save sample-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))

            # shuffle-idx.
            start_time = time.time()
            # -1 is due to data structure used to retrieve the index:
            #    sample i --> [sample_idx[i], sample_idx[i+1])
            if separate_last_epoch:
                num_samples_ = num_samples_from_epochs_minus_one
            else:
                num_samples_ = sample_idx.shape[0] - 1
            shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            logging.info(' > elapsed time to build and save shuffle-idx mapping'
                         ' (seconds): {:4f}'.format(time.time() - start_time))

    torch.distributed.barrier()
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size()
        // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()))

    # Load mappings.
    start_time = time.time()
    logging.info(' > loading doc-idx mapping from {}'.format(doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info(' > loading sample-idx mapping from {}'.format(sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info(' > loading shuffle-idx mapping from {}'.format(shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info('    loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
    logging.info('    total number of samples: {}'.format(sample_idx.shape[0]))
    logging.info('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx
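# Hedged sketch of how the three mappings built above are typically consumed
# when fetching a training sample (it mirrors the "sample i --> [sample_idx[i],
# sample_idx[i+1])" comment in the function). `dataset` is a hypothetical
# indexed dataset returning a token array per document; the helper name is
# illustrative only.
def get_sample(idx, doc_idx, sample_idx, shuffle_idx, dataset):
    idx = shuffle_idx[idx]                       # random sample index -> ordered index
    doc_index_f, offset_f = sample_idx[idx]      # first document and starting token offset
    doc_index_l, offset_l = sample_idx[idx + 1]  # last document and ending token offset
    tokens = []
    for i in range(doc_index_f, doc_index_l + 1):
        doc = dataset[doc_idx[i]]                # doc_idx maps to a (shuffled) document id
        start = offset_f if i == doc_index_f else 0
        end = offset_l + 1 if i == doc_index_l else len(doc)
        tokens.extend(doc[start:end])
    return tokens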
def allreduce_buffer(self):
    """Synchronous buffer data allreduce"""
    self.data.div_(get_data_parallel_world_size())
    torch.distributed.all_reduce(
        self.data, group=get_data_parallel_group())  # type: ignore
def build_model(
    model_provider_func: Callable[[Any, Dict[str, Any]], torch.nn.Module],
    wrap_with_ddp: bool = True,
    virtual_pipeline_model_parallel_size: Optional[int] = None,
    *args,
    **kwargs,
) -> List[torch.nn.Module]:
    """Build the model satisfying pipeline model parallel requirements.

    This function sets `pre_process` and `post_process` in `**kwargs` and passes `*args` and `**kwargs` to
    `model_provider_func`.

    Args:
        model_provider_func: A function which takes `*args` and `**kwargs` and returns a `nn.Module`.
        wrap_with_ddp: If :obj:`True`, wrap the instantiated model with
            `torch.nn.parallel.distributed.DistributedDataParallel`, a.k.a. `DDP`.
        virtual_pipeline_model_parallel_size: Specify when using interleaving scheduling pipeline model parallel.
        *args: arguments for model provider func
        **kwargs: Keyword arguments for model provider func

    Returns:
        a list of `nn.Module`(s). If `virtual_pipeline_model_parallel_size` is not None,
        the list has multiple models, otherwise one.
    """
    if (
        parallel_state.get_pipeline_model_parallel_world_size() > 1
        and virtual_pipeline_model_parallel_size is not None
    ):
        model = []
        for i in range(virtual_pipeline_model_parallel_size):
            cur_args = args
            cur_kwargs = kwargs
            parallel_state.set_virtual_pipeline_model_parallel_rank(i)
            # Set pre_process and post_process only after virtual rank is set.
            pre_process = parallel_state.is_pipeline_first_stage()
            post_process = parallel_state.is_pipeline_last_stage()
            cur_kwargs.update({
                "pre_process": pre_process,
                "post_process": post_process,
            })
            this_model = model_provider_func(*cur_args, **cur_kwargs)
            model.append(this_model)
    else:
        cur_args = args
        cur_kwargs = kwargs
        pre_process = parallel_state.is_pipeline_first_stage()
        post_process = parallel_state.is_pipeline_last_stage()
        cur_kwargs.update({
            "pre_process": pre_process,
            "post_process": post_process,
        })
        model = model_provider_func(*cur_args, **cur_kwargs)

    if not isinstance(model, list):
        model = [model]

    # Set tensor model parallel attributes if not set.
    # Only parameters that are already tensor model parallel have these
    # attributes set for them. We should make sure the default attributes
    # are set for all params so the optimizer can use them.
    for model_module in model:
        for param in model_module.parameters():
            set_defaults_if_not_set_tensor_model_parallel_attributes(param)

    # Print number of parameters.
    if parallel_state.get_data_parallel_rank() == 0:
        msg = " > number of parameters on (tensor, pipeline) model parallel rank ({}, {}): {}".format(
            parallel_state.get_tensor_model_parallel_rank(),
            parallel_state.get_pipeline_model_parallel_rank(),
            sum([sum([p.nelement() for p in model_module.parameters()]) for model_module in model]),
        )
        print(msg, flush=True)

    # GPU allocation.
    for model_module in model:
        model_module.cuda(torch.cuda.current_device())

    if wrap_with_ddp:
        i = torch.cuda.current_device()
        model = [
            torch.nn.parallel.distributed.DistributedDataParallel(
                model_module,
                device_ids=[i],
                output_device=i,
                process_group=parallel_state.get_data_parallel_group(),
            )
            for model_module in model
        ]

    return model
def get_samples_mapping(indexed_dataset, data_prefix, num_epochs, max_num_samples,
                        max_seq_length, short_seq_prob, seed, name, binary_head):
    """Get a list that maps a sample index to a starting sentence index,
    end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0 and not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        logging.info(' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.
        try:
            if is_global_rank_zero():
                compile_helper()
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError:
            raise ImportError(
                'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length,
            short_seq_prob,
            seed,
            verbose,
            2 if binary_head else 1,
        )
        logging.info(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        logging.info(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        logging.info(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    torch.distributed.barrier()
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size()
        // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()))

    # Load indexed dataset.
    logging.info(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    logging.info('    loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
    logging.info('    total number of samples: {}'.format(samples_mapping.shape[0]))

    return samples_mapping
def build_model(
    model_provider_func: Callable[[Any, Dict[str, Any]], torch.nn.Module],
    wrap_with_ddp: bool = True,
    virtual_pipeline_model_parallel_size: Optional[int] = None,
    model_type: ModelType = ModelType.encoder_or_decoder,
    *args: Any,
    **kwargs: Any,
) -> List[torch.nn.Module]:
    """Build the model satisfying pipeline model parallel requirements.

    This function sets `pre_process` and `post_process` in `**kwargs` and passes `*args` and `**kwargs` to
    `model_provider_func`.

    Args:
        model_provider_func: A function which takes `*args` and `**kwargs` and returns a `nn.Module`.
        wrap_with_ddp: If :obj:`True`, wrap the instantiated model with
            `torch.nn.parallel.distributed.DistributedDataParallel`, a.k.a. `DDP`.
        virtual_pipeline_model_parallel_size: Specify when using interleaving scheduling pipeline model parallel.
        model_type: `ModelType.encoder_or_decoder` or `ModelType.encoder_and_decoder`.
        *args: arguments for model provider func
        **kwargs: Keyword arguments for model provider func

    Returns:
        a list of `nn.Module`(s). If `virtual_pipeline_model_parallel_size` is not None,
        the list has multiple models, otherwise one.
    """
    if (parallel_state.get_pipeline_model_parallel_world_size() > 1
            and virtual_pipeline_model_parallel_size is not None):
        model = []
        for i in range(virtual_pipeline_model_parallel_size):
            cur_args = args
            cur_kwargs = kwargs
            parallel_state.set_virtual_pipeline_model_parallel_rank(i)
            # Set pre_process and post_process only after virtual rank is set.
            pre_process = parallel_state.is_pipeline_first_stage()
            post_process = parallel_state.is_pipeline_last_stage()
            cur_kwargs.update({
                "pre_process": pre_process,
                "post_process": post_process,
            })
            this_model = model_provider_func(*cur_args, **cur_kwargs)
            model.append(this_model)
    else:
        cur_args = args
        cur_kwargs = kwargs
        if model_type == ModelType.encoder_or_decoder:
            pre_process = parallel_state.is_pipeline_first_stage()
            post_process = parallel_state.is_pipeline_last_stage()
            cur_kwargs.update({
                "pre_process": pre_process,
                "post_process": post_process,
            })
            model = model_provider_func(*cur_args, **cur_kwargs)
        elif model_type == ModelType.encoder_and_decoder:
            pre_process = parallel_state.is_pipeline_first_stage()
            post_process = parallel_state.is_pipeline_last_stage()
            # `add_encoder` & `add_decoder` logic.
            add_encoder, add_decoder = True, True
            if parallel_state.get_pipeline_model_parallel_world_size() > 1:
                split_rank = parallel_state.get_pipeline_model_parallel_split_rank()
                if split_rank is None:
                    raise RuntimeError("Split rank needs to be specified for model with both encoder and decoder.")
                rank = parallel_state.get_pipeline_model_parallel_rank()
                world_size = parallel_state.get_pipeline_model_parallel_world_size()
                pre_process = rank == 0 or rank == split_rank
                post_process = rank == (split_rank - 1) or rank == (world_size - 1)
                add_encoder = parallel_state.is_pipeline_stage_before_split()
                add_decoder = parallel_state.is_pipeline_stage_after_split()
            cur_kwargs.update({
                "pre_process": pre_process,
                "post_process": post_process,
                "add_encoder": add_encoder,
                "add_decoder": add_decoder,
            })
            model = model_provider_func(*cur_args, **cur_kwargs)
        model.model_type = model_type

    if not isinstance(model, list):
        model = [model]

    # Set tensor model parallel attributes if not set.
    # Only parameters that are already tensor model parallel have these
    # attributes set for them. We should make sure the default attributes
    # are set for all params so the optimizer can use them.
    for model_module in model:
        for param in model_module.parameters():
            set_defaults_if_not_set_tensor_model_parallel_attributes(param)

    # Print number of parameters.
    if (parallel_state.model_parallel_is_initialized()
            and parallel_state.get_data_parallel_rank() == 0):
        msg = " > number of parameters on (tensor, pipeline) model parallel rank ({}, {}): {}".format(
            parallel_state.get_tensor_model_parallel_rank(),
            parallel_state.get_pipeline_model_parallel_rank(),
            _calc_number_of_params(model),
        )
        print(msg, flush=True)

    # GPU allocation.
    for model_module in model:
        model_module.cuda(torch.cuda.current_device())

    if wrap_with_ddp:
        i = torch.cuda.current_device()
        model = [
            torch.nn.parallel.distributed.DistributedDataParallel(
                model_module,
                device_ids=[i],
                output_device=i,
                process_group=parallel_state.get_data_parallel_group(),
            )
            for model_module in model
        ]

    return model
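# Hedged usage sketch for build_model: `MyTransformerLM` and its arguments are
# placeholders; the only contract relied on is that build_model injects the
# `pre_process` / `post_process` keyword arguments before calling the provider.
def model_provider(hidden_size, num_layers, pre_process=True, post_process=True):
    # pre_process is True only on the first pipeline stage (embedding lives there);
    # post_process is True only on the last stage (output head / loss live there).
    return MyTransformerLM(hidden_size=hidden_size, num_layers=num_layers,
                           pre_process=pre_process, post_process=post_process)

models = build_model(
    model_provider,
    wrap_with_ddp=True,
    virtual_pipeline_model_parallel_size=None,
    model_type=ModelType.encoder_or_decoder,
    hidden_size=1024,
    num_layers=24,
)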
def eval_epoch_end(self, outputs, mode):
    if isinstance(outputs[0], dict):
        outputs = [outputs]

    loss_list = []
    bleu_score_list = []
    for dataloader_idx, output in enumerate(outputs):
        averaged_loss = average_losses_across_data_parallel_group([x['loss'] for x in output])
        inputs = list(itertools.chain(*[x['inputs'] for x in output]))
        translations = list(itertools.chain(*[x['translations'] for x in output]))
        ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output]))
        assert len(translations) == len(inputs)
        assert len(translations) == len(ground_truths)

        # Gather translations and ground truths from all workers
        tr_gt_inp = [None for _ in range(parallel_state.get_data_parallel_world_size())]
        # we also need to drop pairs where ground truth is an empty string
        torch.distributed.all_gather_object(
            tr_gt_inp,
            [(t, g, i) for (t, g, i) in zip(translations, ground_truths, inputs)],
            group=parallel_state.get_data_parallel_group(),
        )
        if parallel_state.get_data_parallel_rank() == 0:
            _translations = []
            _ground_truths = []
            _inputs = []

            # Deduplicate sentences that may have been distributed across multiple data parallel ranks.
            gt_inp_set = set()
            for rank in range(0, parallel_state.get_data_parallel_world_size()):
                for t, g, i in tr_gt_inp[rank]:
                    if g + i not in gt_inp_set:
                        gt_inp_set.add(g + i)
                        _translations.append(t)
                        _ground_truths.append(g)
                        _inputs.append(i)

            if self.tgt_language in ['ja']:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="ja-mecab")
            elif self.tgt_language in ['zh']:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="zh")
            else:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a")

            bleu_score = sacre_bleu.score * parallel_state.get_data_parallel_world_size()

            dataset_name = "Validation" if mode == 'val' else "Test"
            logging.info(f"{dataset_name}, Dataloader index: {dataloader_idx}, Set size: {len(_translations)}")
            logging.info(
                f"{dataset_name}, Dataloader index: {dataloader_idx}, SacreBLEU = {bleu_score / parallel_state.get_data_parallel_world_size()}"
            )
            logging.info(f"{dataset_name}, Dataloader index: {dataloader_idx}, Translation Examples:")
            logging.info('============================================================')
            for example_idx in range(0, 3):
                random_index = random.randint(0, len(_translations) - 1)
                logging.info("    " + '\u0332'.join(f"Example {example_idx}:"))
                logging.info(f"    Input:        {_inputs[random_index]}")
                logging.info(f"    Prediction:   {_translations[random_index]}")
                logging.info(f"    Ground Truth: {_ground_truths[random_index]}")
            logging.info('============================================================')
        else:
            bleu_score = 0.0

        loss_list.append(averaged_loss[0].cpu().numpy())
        bleu_score_list.append(bleu_score)
        if dataloader_idx == 0:
            self.log(f'{mode}_sacreBLEU', bleu_score, sync_dist=True)
            self.log(f'{mode}_loss', averaged_loss[0], prog_bar=True)
            if self.multilingual:
                self._log_multilingual_bleu_and_loss(dataloader_idx, bleu_score, averaged_loss[0], mode)
        else:
            if self.multilingual:
                self._log_multilingual_bleu_and_loss(dataloader_idx, bleu_score, averaged_loss[0], mode)
            else:
                self.log(f'{mode}_sacreBLEU_dl_index_{dataloader_idx}', bleu_score, sync_dist=True)
                self.log(f'{mode}_loss_dl_index_{dataloader_idx}', averaged_loss[0], prog_bar=False)

    if len(loss_list) > 1:
        self.log(f"{mode}_loss_avg", np.mean(loss_list), sync_dist=True)
        self.log(f"{mode}_sacreBLEU_avg", np.mean(bleu_score_list), sync_dist=True)