def aggregate_logging_outputs(logging_outputs):
    """Aggregate logging outputs from data parallel training."""
    loss_sum = utils.item(sum(log.get('loss', 0) for log in logging_outputs))
    ntokens = utils.item(sum(log.get('ntokens', 0) for log in logging_outputs))
    nsentences = utils.item(sum(log.get('nsentences', 0) for log in logging_outputs))
    sample_size = utils.item(sum(log.get('sample_size', 0) for log in logging_outputs))
    agg_output = {
        'loss': loss_sum / sample_size / math.log(2),
        'ntokens': ntokens,
        'nsentences': nsentences,
        'sample_size': sample_size,
    }
    if sample_size != ntokens:
        agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
    correct = sum(log.get('correct', 0) for log in logging_outputs)
    total = sum(log.get('count', 0) for log in logging_outputs)
    if total > 0:
        agg_output['accuracy'] = correct / total
    # Average any extra (non-builtin) keys across workers; keys that look
    # like losses are re-normalized per token.
    builtin_keys = {'loss', 'ntokens', 'nsentences', 'sample_size', 'correct', 'count'}
    for k in logging_outputs[0]:
        if k not in builtin_keys:
            val = sum(log.get(k, 0) for log in logging_outputs) / len(logging_outputs)
            if k.startswith('loss'):
                val = val / ntokens if ntokens > 0 else float('nan')
            agg_output[k] = val
    return agg_output
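# A minimal usage sketch of aggregate_logging_outputs (illustrative values,
# not real training output). Each dict mimics one data-parallel worker's
# logging output; 'loss' is assumed to be a summed loss in nats, which is why
# the aggregate divides by math.log(2) to report base-2 values.
_example_logging_outputs = [
    {'loss': 45.2, 'ntokens': 64, 'nsentences': 4, 'sample_size': 64, 'correct': 50, 'count': 64},
    {'loss': 38.9, 'ntokens': 58, 'nsentences': 4, 'sample_size': 58, 'correct': 47, 'count': 58},
]
# agg = aggregate_logging_outputs(_example_logging_outputs)
# agg['loss'] == (45.2 + 38.9) / 122 / math.log(2); agg['accuracy'] == 97 / 122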
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""
    loss_sum = utils.item(sum(log.get('loss', 0) for log in logging_outputs))
    nll_loss_sum = utils.item(sum(log.get('nll_loss', 0) for log in logging_outputs))
    alignment_loss_sum = utils.item(sum(log.get('alignment_loss', 0) for log in logging_outputs))
    ntokens = utils.item(sum(log.get('ntokens', 0) for log in logging_outputs))
    sample_size = utils.item(sum(log.get('sample_size', 0) for log in logging_outputs))
    metrics.log_scalar('loss', loss_sum / sample_size / math.log(2), sample_size, round=3)
    metrics.log_scalar('nll_loss', nll_loss_sum / ntokens / math.log(2), ntokens, round=3)
    metrics.log_scalar('alignment_loss', alignment_loss_sum / sample_size / math.log(2), sample_size, round=3)
    metrics.log_derived('ppl', lambda meters: utils.get_perplexity(meters['nll_loss'].avg))
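# Hedged note on the derived 'ppl' metric above: 'nll_loss' is logged in
# base 2, so perplexity is 2 ** nll_loss. A minimal stand-in for
# utils.get_perplexity (an assumption, not fairseq's exact helper, which
# also rounds and guards against overflow):
def _get_perplexity_sketch(loss):
    return float(2 ** loss) if loss is not None else 0.0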
def forward(self, model, sample, reduce=True):
    """Compute the loss for the given sample.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    net_output = model(**sample['net_input'])
    loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
    sample_size = sample['target'].size(0) if self.sentence_avg else sample['ntokens']
    logging_output = {
        'loss': utils.item(loss.data) if reduce else loss.data,
        'nll_loss': utils.item(nll_loss.data) if reduce else nll_loss.data,
        'ntokens': sample['ntokens'],
        'nsentences': sample['target'].size(0),
        'sample_size': sample_size,
    }
    alignment_loss = None

    # Compute alignment loss only for the training set and non-dummy batches.
    if 'alignments' in sample and sample['alignments'] is not None:
        alignment_loss = self.compute_alignment_loss(sample, net_output)

    if alignment_loss is not None:
        logging_output['alignment_loss'] = utils.item(alignment_loss.data)
        loss += self.alignment_lambda * alignment_loss

    return loss, sample_size, logging_output
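# Hedged sketch of the alignment-loss idea used above: supervise the decoder
# attention at known aligned (src, tgt) positions with a negative
# log-likelihood. Names and shapes here are illustrative assumptions, not
# the criterion's actual compute_alignment_loss.
import torch

def _alignment_loss_sketch(attn_probs, alignments, align_weights):
    # attn_probs: (bsz * tgt_len, src_len) attention distributions
    # alignments: (n_pairs, 2) long tensor of (src_idx, flattened tgt_idx) pairs
    # align_weights: (n_pairs,) float tensor of per-pair weights
    if alignments.numel() == 0:
        return None  # nothing to supervise (e.g. a dummy batch)
    picked = attn_probs[alignments[:, 1], alignments[:, 0]]
    return -(picked.log() * align_weights).sum()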
def upgrade_state_dict_named(self, state_dict, name):
    """Upgrade a (possibly old) state dict for new versions of fairseq."""
    if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
        weights_key = "{}.embed_positions.weights".format(name)
        if weights_key in state_dict:
            del state_dict[weights_key]
        state_dict["{}.embed_positions._float_tensor".format(name)] = torch.FloatTensor(1)
    # update layer norms
    layer_norm_map = {
        "0": "self_attn_layer_norm",
        "1": "encoder_attn_layer_norm",
        "2": "final_layer_norm",
    }
    for i in range(self.num_layers):
        for old, new in layer_norm_map.items():
            for m in ("weight", "bias"):
                k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m)
                if k in state_dict:
                    state_dict["{}.layers.{}.{}.{}".format(name, i, new, m)] = state_dict[k]
                    del state_dict[k]
    version_key = "{}.version".format(name)
    if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
        # earlier checkpoints did not normalize after the stack of layers
        self.layer_norm = None
        self.normalize = False
        state_dict[version_key] = torch.Tensor([1])
    return state_dict
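# Self-contained demo of the layer-norm key remapping above, on a toy state
# dict (hypothetical checkpoint keys; name='decoder', one layer):
import torch

_toy_state = {
    'decoder.layers.0.layer_norms.0.weight': torch.ones(4),
    'decoder.layers.0.layer_norms.2.bias': torch.zeros(4),
}
_layer_norm_map = {'0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm'}
for _old, _new in _layer_norm_map.items():
    for _m in ('weight', 'bias'):
        _k = 'decoder.layers.0.layer_norms.{}.{}'.format(_old, _m)
        if _k in _toy_state:
            _toy_state['decoder.layers.0.{}.{}'.format(_new, _m)] = _toy_state.pop(_k)
# _toy_state now contains 'decoder.layers.0.self_attn_layer_norm.weight'
# and 'decoder.layers.0.final_layer_norm.bias'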
def upgrade_state_dict_named(self, state_dict, name):
    """Upgrade a (possibly old) state dict for new versions of fairseq."""
    if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
        weights_key = "{}.embed_positions.weights".format(name)
        if weights_key in state_dict:
            print("deleting {0}".format(weights_key))
            del state_dict[weights_key]
        state_dict["{}.embed_positions._float_tensor".format(name)] = torch.FloatTensor(1)
    for i in range(self.num_layers):
        # update layer norms
        self.layers[i].upgrade_state_dict_named(state_dict, "{}.layers.{}".format(name, i))
    version_key = "{}.version".format(name)
    if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2:
        # earlier checkpoints did not normalize after the stack of layers
        self.layer_norm = None
        self.normalize = False
        state_dict[version_key] = torch.Tensor([1])
    return state_dict
def forward(self, model, sample, reduce=True):
    """Compute the loss for the given sample.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    lm_logits, output_metadata = model(**sample["net_input"])

    # reshape lm_logits from (N, T, C) to (N*T, C)
    lm_logits = lm_logits.view(-1, lm_logits.size(-1))
    lm_targets = sample['lm_target'].view(-1)
    lm_loss = compute_cross_entropy_loss(lm_logits, lm_targets, self.padding_idx)

    # compute the number of tokens for which the loss is computed; this is
    # used to normalize the loss
    ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel()
    loss = lm_loss / ntokens
    nsentences = sample['nsentences']

    # Compute the sentence (NSP) loss only if masked_lm_only is False
    sentence_loss = None
    if not self.masked_lm_only:
        sentence_logits = output_metadata['sentence_logits']
        sentence_targets = sample['sentence_target'].view(-1)
        # This needs to be recomputed due to some differences between
        # TokenBlock and BlockPair dataset. This can be resolved with a
        # refactor of BERTModel which we will do in the future.
        # TODO: Remove this after refactor of BERTModel
        nsentences = sentence_targets.size(0)

        # Check for logits being None, which can happen when remove_heads is
        # set to True in the BERT model. Ideally we should set masked_lm_only
        # to True in this case, but that requires some refactoring in the
        # BERT model.
        if sentence_logits is not None:
            sentence_loss = compute_cross_entropy_loss(sentence_logits, sentence_targets)
            loss += self.nsp_loss_weight * (sentence_loss / nsentences)

    # NOTE: as we are summing up the per-token MLM loss and the per-sentence
    # NSP loss, we don't need to use sample_size as the denominator for the
    # gradient; here sample_size is only used for logging
    sample_size = 1
    logging_output = {
        'loss': utils.item(loss.data) if reduce else loss.data,
        'lm_loss': utils.item(lm_loss.data) if reduce else lm_loss.data,
        # sentence loss is not always computed
        'sentence_loss': (
            (utils.item(sentence_loss.data) if reduce else sentence_loss.data)
            if sentence_loss is not None else 0.0
        ),
        'ntokens': ntokens,
        'nsentences': nsentences,
        'sample_size': sample_size,
    }
    return loss, sample_size, logging_output
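# compute_cross_entropy_loss is used above but not defined in this file. A
# minimal sketch consistent with its call sites (summed, padding-aware cross
# entropy over flattened logits); the real helper may differ:
import torch
import torch.nn.functional as F

def _compute_cross_entropy_loss_sketch(logits, targets, ignore_index=-100):
    # logits: (N*T, C); targets: (N*T,). Sum reduction so the caller can
    # normalize by its own token count (ntokens or nsentences).
    lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    return F.nll_loss(lprobs, targets, reduction='sum', ignore_index=ignore_index)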