Ejemplo n.º 1
0
def tldr_loss(model, batch, args):
    """Compute a cosine-weighted negative log-likelihood for one flat sequence.

    Each target token's negative log-probability is scaled by
    ``(cos(pi * p) + 1) ** args.focal_gamma``, where ``p`` is the probability
    the model assigns to that token. The weight therefore falls from
    ``2 ** focal_gamma`` at ``p = 0`` to ``0`` at ``p = 1``.

    Args:
        model: Callable accepting ``input_ids``; element 0 of its output is
            used as the logits (indexed here as ``(1, seq_len, vocab)``).
        batch: Sequence whose first element is the token-id tensor.
            Presumably each row holds ``args.train_batch_size + 1`` tokens so
            inputs and shifted targets have equal length — confirm with the
            data loader.
        args: Namespace providing ``gpu``, ``train_batch_size`` and
            ``focal_gamma``.

    Returns:
        Tuple ``(loss, logging_output)``. ``loss`` is the summed weighted loss
        divided by the number of input tokens. ``logging_output`` is the
        result of ``TrainingMetrics.ranking_metrics`` extended with the summed
        plain NLL (``'loss'``), the summed weighted loss (``'tldr_loss'``) and
        the token counts.

    Raises:
        AssertionError: If the leading (batch) dimension is not 1.
    """
    longer_sample = batch[0].to(args.gpu)
    inp = longer_sample[:, :args.train_batch_size]
    target = longer_sample[:, 1:args.train_batch_size + 1]
    logits = model(input_ids=inp)[0]

    lprobs = F.log_softmax(logits, dim=-1)
    assert lprobs.size(0) == 1, 'We work on flat sequences'
    nll_loss = F.nll_loss(lprobs[0], target[0], reduction='sum')

    # Pick log p(target_t) for every position on-device instead of indexing
    # with a NumPy arange.
    lprobs_y = lprobs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    # Computed once; no debug print, which would force a device sync per step.
    focal_weight = (torch.cos(np.pi * lprobs_y.exp()) + 1) ** args.focal_gamma
    loss = (focal_weight * (-lprobs_y)).sum()

    true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')
    ntokens = inp.numel()

    logging_output = TrainingMetrics.ranking_metrics(logits[0].float(),
                                                     true_token_logits, None,
                                                     ntokens, target[0])
    logging_output['loss'] = nll_loss.item()
    logging_output['tldr_loss'] = loss.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens

    loss = loss / ntokens

    return loss, logging_output
def eval_singletoken_argmax(model, args, dataset_paths, config,
                            train_iter=None, batch_size=None):
    """Evaluate greedy (argmax) next-token prediction on the evaluation split.

    The split ``args.eval_split`` is iterated one sample at a time. For each
    sample the summed NLL, the argmax predictions and the ranking metrics are
    collected; the per-sample logs are then aggregated and passed to
    ``save_singletoken_metrics``.

    Args:
        model: Callable accepting ``input_ids``; element 0 of its output is
            used as the logits.
        args: Namespace providing ``batch_size_singletoken``, ``eval_split``
            and ``gpu``.
        dataset_paths: Forwarded to ``get_datasets``.
        config: Object with ``to_dict()``; stored alongside the metrics.
        train_iter: Forwarded to ``save_singletoken_metrics``.
        batch_size: Context length override; defaults to
            ``args.batch_size_singletoken``.

    Returns:
        The aggregated logging dict, extended with ``'ppl'``
        (``2 ** loss``), ``'uniq'`` (distinct predicted tokens) and
        ``'human_uniq'`` (distinct target tokens).

    Raises:
        AssertionError: If a sample's leading (batch) dimension is not 1.
    """
    batch_size = batch_size if batch_size is not None else args.batch_size_singletoken
    datasets = get_datasets(dataset_paths, max_len=batch_size)
    eval_split = datasets[args.eval_split]
    eval_dataloader = DataLoader(
        eval_split, sampler=SequentialSampler(eval_split), batch_size=1)

    model.eval()

    logging_outputs = []
    predicted_tokens = []
    target_tokens = []
    with torch.no_grad():
        for batch in tqdm(eval_dataloader,
                          desc="Evaluating", total=len(eval_dataloader)):
            longer_sample = batch[0].to(args.gpu)
            # Slice with the resolved batch_size (the same value given to
            # get_datasets) so an explicit override keeps inputs and targets
            # aligned; previously args.batch_size_singletoken was used here.
            inp = longer_sample[:, :batch_size]
            model_output = model(input_ids=inp)
            target = longer_sample[:, 1:]
            logits = model_output[0]
            lprobs = F.log_softmax(logits, dim=-1)
            assert lprobs.size(0) == 1, 'We work on flat sequences'
            loss = F.nll_loss(lprobs[0], target[0], reduction='sum')
            true_token_logits = -F.nll_loss(
                logits[0], target[0], reduction='none')

            predicted_tokens.extend(lprobs.argmax(dim=-1).view(-1).tolist())
            ntokens = inp.numel()

            logging_output = TrainingMetrics.ranking_metrics(
                logits[0].float(), true_token_logits, None, ntokens, target[0])
            logging_output['loss'] = loss.item()
            logging_output['normalizer'] = ntokens
            logging_output['sample_size'] = ntokens
            logging_output['ntokens'] = ntokens
            logging_outputs.append(logging_output)

            # for human uniq
            target_tokens.extend(target.view(-1).tolist())

    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
        logging_outputs)
    logging_average['ppl'] = 2 ** logging_average['loss']
    logging_average['uniq'] = len(set(predicted_tokens))
    logging_average['human_uniq'] = len(set(target_tokens))

    save_singletoken_metrics(
        logging_average,
        config.to_dict(),
        args,
        train_iter=train_iter)
    return logging_average
Ejemplo n.º 3
0
def mle_loss(model, batch, args):
    """Compute the mean next-token negative log-likelihood for a batch.

    ``batch.pre_tru`` is first replaced by its truncated version (a side
    effect visible to the caller). The model is then fed every token except
    the last, and is trained to predict every token except the first.

    Args:
        model: Callable taking the input token ids; element 0 of its output
            is used as the logits.
        batch: Object with a ``pre_tru`` attribute holding a 2-D token-id
            tensor.
        args: Forwarded to ``truncate_batch``.

    Returns:
        Tuple ``(loss, logging_output)``. ``loss`` is the NLL averaged over
        all target positions (``reduction='mean'``, so no further division by
        the token count is needed). ``logging_output`` is the dict returned by
        ``TrainingMetrics.ranking_metrics`` extended with the loss value and
        token counts.
    """
    batch.pre_tru = truncate_batch(args, batch.pre_tru)
    tokens = batch.pre_tru

    # Next-token shift: position t of the input predicts token t + 1, so the
    # input drops the last token and the target drops the first. Feeding the
    # full sequence would give one more logit row than there are targets.
    inp = tokens[:, :-1]
    target = tokens[:, 1:].clone().detach()  # bsz, seqlen

    model_output = model(inp)
    logits = model_output[0]  # assumed bsz, seqlen, vocabsize - TODO confirm
    vocabsize = logits.size(-1)

    # Flatten batch and time so every position is one row, the same layout
    # the criterion forward() passes to the loss and the ranking metrics.
    flat_logits = logits.reshape(-1, vocabsize)
    flat_target = target.reshape(-1)
    lprobs = F.log_softmax(flat_logits, dim=-1)

    # reduction method on original code: 'sum'
    loss = F.nll_loss(lprobs, flat_target, reduction='mean')
    true_token_logits = -F.nll_loss(flat_logits, flat_target,
                                    reduction='none')

    ntokens = inp.numel()

    # NOTE(review): ranking_metrics is given all flattened positions rather
    # than only the first sequence, so logits, true-token logits and targets
    # describe the same rows - confirm this matches its expected input.
    logging_output = TrainingMetrics.ranking_metrics(flat_logits,
                                                     true_token_logits, None,
                                                     ntokens, flat_target)
    logging_output['loss'] = loss.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens

    return loss, logging_output
def alpha_entmax_loss(model, batch, args):
    """Compute the alpha-entmax loss for one flat sequence.

    Per position the loss is ``sum((probs - one_hot(target)) * logits)`` plus
    ``alpha_entropy(probs, args.alpha)``, where ``probs`` comes from
    ``entmax_bisect``. The per-position values are summed and finally divided
    by the number of input tokens.

    Args:
        model: Callable accepting ``input_ids``; element 0 of its output is
            used as the logits.
        batch: Sequence whose first element is the token-id tensor.
            Presumably each row holds ``args.train_batch_size + 1`` tokens -
            confirm with the data loader.
        args: Namespace providing ``gpu``, ``train_batch_size``, ``alpha`` and
            ``laplas_eps``.

    Returns:
        Tuple ``(loss, logging_output)``. ``logging_output`` is the dict from
        ``TrainingMetrics.ranking_metrics`` extended with the summed loss, a
        Laplace-smoothed NLL (``'smoothed_nll_loss'``), the mean
        Jensen-Shannon divergence (``'js_div'``) and the token counts.
    """
    longer_sample = batch[0].to(args.gpu)
    inp = longer_sample[:, :args.train_batch_size]
    target = longer_sample[:, 1:args.train_batch_size + 1]
    logits = model(input_ids=inp)[0]

    alpha = torch.tensor([args.alpha],
                         requires_grad=True,
                         device=torch.device(args.gpu))
    probs = entmax_bisect(logits, alpha)
    loss = ((probs - F.one_hot(target, num_classes=probs.size(-1))) *
            logits).sum(-1)
    loss += alpha_entropy(probs, args.alpha)
    loss = loss.sum()

    true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')
    ntokens = inp.numel()

    # Probability assigned to each gold token, gathered on-device (avoids a
    # tensor -> Python list round trip).
    next_token_probs = probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    voc_sizes = probs.size(-1)
    # Additive smoothing keeps the log finite when the sparse distribution
    # puts exactly zero mass on the gold token.
    smoothed_nll = -torch.mean(
        torch.log((next_token_probs + args.laplas_eps) /
                  (1 + args.laplas_eps * voc_sizes)))

    logging_output = TrainingMetrics.ranking_metrics(logits[0].float(),
                                                     true_token_logits, None,
                                                     ntokens, target[0])
    logging_output['loss'] = loss.item()
    logging_output['smoothed_nll_loss'] = smoothed_nll.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens
    logging_output['js_div'] = jensen_shannon_divergence(probs,
                                                         target).mean().item()

    loss = loss / ntokens

    return loss, logging_output
Ejemplo n.º 5
0
def mle_loss(model, batch, args):
    """Return the per-token MLE loss and ranking metrics for one flat sequence.

    The first ``args.train_batch_size`` tokens of ``batch[0]`` are fed to the
    model and everything after the first token is used as the target.

    Args:
        model: Callable taking the input ids; element 0 of its output is used
            as the logits.
        batch: Sequence whose first element is the token-id tensor.
        args: Namespace providing ``train_batch_size``.

    Returns:
        Tuple ``(loss, logging_output)``: the summed NLL divided by the number
        of input tokens, and the ``TrainingMetrics.ranking_metrics`` dict
        extended with the summed NLL and the token counts.

    Raises:
        AssertionError: If the leading (batch) dimension is not 1.
    """
    tokens = batch[0].cuda()
    context = tokens[:, :args.train_batch_size]
    logits = model(context)[0]
    gold = tokens[:, 1:]

    log_probs = F.log_softmax(logits, dim=-1)
    assert log_probs.size(0) == 1, 'We work on flat sequences'

    summed_nll = F.nll_loss(log_probs[0], gold[0], reduction='sum')
    gold_logits = -F.nll_loss(logits[0], gold[0], reduction='none')
    token_count = context.numel()

    logging_output = TrainingMetrics.ranking_metrics(
        logits[0], gold_logits, None, token_count, gold[0])
    extra_entries = (
        ('loss', summed_nll.item()),
        ('normalizer', token_count),
        ('sample_size', token_count),
        ('ntokens', token_count),
    )
    for key, value in extra_entries:
        logging_output[key] = value

    return summed_nll / token_count, logging_output
    def aggregate_logging_outputs(logging_outputs):
        """Merge per-worker logging dicts into one summary dict.

        The ``'loss'``, ``'ntokens'``, ``'nsentences'`` and ``'sample_size'``
        entries are summed (missing keys count as 0). The reported loss is the
        summed loss divided by the sample size and by ``log(2)``, i.e. in
        base 2. The entries from ``TrainingMetrics.aggregate_and_normalize``
        are then merged in, and ``'nll_loss'`` (normalised by token count) is
        added when the sample size differs from the token count.
        """
        def total(key):
            # Sum one field across all logs, treating absent keys as 0.
            return sum(entry.get(key, 0) for entry in logging_outputs)

        loss_sum = total('loss')
        ntokens = total('ntokens')
        nsentences = total('nsentences')
        sample_size = total('sample_size')

        if sample_size > 0:
            base2_loss = loss_sum / sample_size / math.log(2)
        else:
            base2_loss = 0.

        agg_output = {
            'loss': base2_loss,
            'ntokens': ntokens,
            'nsentences': nsentences,
            'sample_size': sample_size,
        }

        from fairseq.custom.metrics import TrainingMetrics
        custom_output = TrainingMetrics.aggregate_and_normalize(
            logging_outputs)
        agg_output.update(custom_output.items())

        if sample_size != ntokens:
            if ntokens > 0:
                agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
            else:
                agg_output['nll_loss'] = 0.
        return agg_output
    def forward(self, model, sample, reduce=True, compute_custom_metrics=True):
        """Run the model on ``sample`` and return the loss with logging data.

        Returns a 3-tuple:
        1) the loss from ``self.compute_loss``
        2) the sample size, which is used as the denominator for the gradient
           (the number of target rows when ``self.args.sentence_avg`` is set,
           otherwise ``sample['ntokens']``)
        3) the logging dict shown during training, extended with the ranking
           metrics when ``compute_custom_metrics`` is true
        """
        net_output = model(**sample['net_input'])
        raw_logits = net_output[0]
        # One row per position so it lines up with the flattened targets.
        logits = raw_logits.view(-1, raw_logits.size(-1))
        target = model.get_targets(sample, net_output).view(-1)
        loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce)

        if self.args.sentence_avg:
            sample_size = sample['target'].size(0)
        else:
            sample_size = sample['ntokens']

        # Negated nll_loss on raw logits yields the logit of each gold token.
        # NOTE(review): original author wondered whether reduction should be
        # 'mean' for the batch case.
        true_token_logits = -F.nll_loss(
            logits, target, ignore_index=self.padding_idx, reduction='none')
        # Token count excluding padding.
        ntokens = utils.strip_pad(target, self.padding_idx).numel()

        if reduce:
            logged_loss = utils.item(loss.data)
        else:
            logged_loss = loss.data
        logging_output = {
            'loss': logged_loss,
            'ntokens': sample['ntokens'],
            'nsentences': sample['target'].size(0),
            'sample_size': sample_size,
        }

        if compute_custom_metrics:
            custom_output = TrainingMetrics.ranking_metrics(
                logits, true_token_logits, sample, ntokens, target)
            logging_output.update(custom_output.items())
        return loss, sample_size, logging_output
Ejemplo n.º 8
0
def eval_single_token_prediction(model,
                                 itr,
                                 dictionary,
                                 singletoken_topp=0.0,
                                 singletoken_topk=1):
    """Evaluate next-token prediction over ``itr`` and collect metrics.

    For every sample the pad logit is masked, the argmax predictions and the
    flattened targets are recorded, and the MLE loss, ranking metrics and
    "wrong mass" (softmax mass on tokens whose logit exceeds the gold token's
    logit) are accumulated.

    Only ``net_output[0][0]`` is used as logits, so each sample is treated as
    holding a single sequence.

    Args:
        model: Model called with ``**sample['net_input']``; must also provide
            ``get_normalized_probs``.
        itr: Iterable of samples with ``'net_input'`` and ``'target'``.
        dictionary: Provides ``pad()`` and ``pad_index``.
        singletoken_topp: Forwarded to ``TrainingMetrics.ranking_metrics``.
        singletoken_topk: Forwarded to ``TrainingMetrics.ranking_metrics``.

    Returns:
        Tuple ``(predicted_tokens, target_tokens, custom_metrics)``. The first
        two are lists with one list of token ids per sample;
        ``custom_metrics`` is the aggregated metric dict extended with
        ``'ppl'`` and ``'avg_wrong_mass'``.

    Raises:
        ZeroDivisionError: If no non-pad target tokens were seen (for example
            when ``itr`` is empty).
    """
    predicted_tokens = []
    target_tokens = []

    mle_loss_sum = 0
    num_samples_sum = 0
    wrong_mass_sum = 0

    logging_outputs = []

    for _, sample in tqdm(enumerate(itr)):
        sample = utils.move_to_cuda(sample)
        net_output = model(**sample['net_input'])
        logits = net_output[0][0]
        # In-place mask so padding can never be the argmax. This writes
        # through to net_output[0], which is later handed to
        # get_normalized_probs.
        logits[:, dictionary.pad()] = -1e19
        predicted_tokens.append(logits.argmax(1).tolist())
        target = sample['target'].view(-1)
        target_tokens.append(target.tolist())

        # -- mle loss
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        true_token_lprobs = F.nll_loss(
            lprobs,
            target,
            ignore_index=dictionary.pad_index,
            reduction='none',
        )
        true_token_logits = -F.nll_loss(
            logits,
            target,
            ignore_index=dictionary.pad_index,
            reduction='none',
        )
        mle_loss = true_token_lprobs.sum()
        orig = utils.strip_pad(target, dictionary.pad_index)
        ntokens = orig.numel()

        mle_loss_sum += mle_loss.item()
        num_samples_sum += ntokens

        logging_output = TrainingMetrics.ranking_metrics(logits,
                                                         true_token_logits,
                                                         sample,
                                                         ntokens,
                                                         target,
                                                         topk=singletoken_topk,
                                                         topp=singletoken_topp)

        negative_targets = (logits > true_token_logits[:, None]).float()
        # Accumulate a Python float rather than a tensor: summing tensors
        # across iterations would keep every iteration's autograd graph (and
        # device memory) alive when the caller is not under no_grad.
        wrong_mass_sum += (negative_targets *
                           F.softmax(logits, dim=1)).sum().item()

        logging_outputs.append(logging_output)

    # mle_loss_sum is in nats; 2 ** (nats / ln 2) equals exp(mean NLL).
    ppl = math.pow(2, mle_loss_sum / num_samples_sum / math.log(2))
    custom_metrics = TrainingMetrics.aggregate_and_normalize(logging_outputs)
    custom_metrics['ppl'] = ppl
    custom_metrics['avg_wrong_mass'] = wrong_mass_sum / num_samples_sum
    return predicted_tokens, target_tokens, custom_metrics
def eval_singletoken(model,
                     args,
                     dataset_paths,
                     config,
                     top_k=1,
                     top_p=0.0,
                     t=1.0,
                     train_iter=None,
                     batch_size=None):
    """Evaluate sampled next-token prediction on the evaluation split.

    For each sample of ``args.eval_split`` a next-token distribution is built,
    either by top-k/top-p filtered softmax (when ``args.alpha_entmax`` is
    ``False``) or by ``entmax_bisect``. One token per position is sampled from
    it, and the NLL, smoothed NLL, Jensen-Shannon divergence and ranking
    metrics are logged. The per-sample logs are aggregated and handed to
    ``save_singletoken_sampling_metrics``.

    Args:
        model: Callable accepting ``input_ids``; element 0 of its output is
            used as the logits.
        args: Namespace providing ``alpha_entmax``, ``batch_size_singletoken``,
            ``eval_split``, ``gpu``, ``top_k``, ``top_p``, ``alpha``,
            ``laplas_eps`` and ``token_loss``.
        dataset_paths: Forwarded to ``get_datasets``.
        config: Object with ``to_dict()``; stored alongside the metrics.
        top_k: Only recorded with the saved metrics. NOTE(review): the
            filtering itself reads ``args.top_k`` - confirm callers keep the
            two in sync.
        top_p: Only recorded with the saved metrics; filtering reads
            ``args.top_p`` (same caveat as ``top_k``).
        t: Not used in this function.
        train_iter: Forwarded to ``save_singletoken_sampling_metrics``.
        batch_size: Context length override; defaults to
            ``args.batch_size_singletoken``.

    Returns:
        The aggregated logging dict extended with ``'e_ppl'``, ``'ppl'``,
        ``'human_uniq'``, ``'uniq'``, ``'wrep'``, ``'rep'``, ``'js_div'`` and,
        when ``args.token_loss == 'alpha_entmax'``, ``'alpha_entmax_loss'``.
    """
    alpha_entmax = args.alpha_entmax

    batch_size = batch_size if batch_size is not None else args.batch_size_singletoken
    datasets = get_datasets(dataset_paths, max_len=batch_size)
    eval_sampler = SequentialSampler(datasets[args.eval_split])
    eval_dataloader = DataLoader(
        datasets[args.eval_split], sampler=eval_sampler, batch_size=1)

    model.eval()

    logging_outputs = []
    predicted_tokens = []
    target_tokens = []
    with torch.no_grad():
        for i, batch in tqdm(enumerate(eval_dataloader),
                             desc="Evaluating", total=len(eval_dataloader)):
            longer_sample = batch[0].to(args.gpu)
            # Slice with the resolved batch_size (the same value given to
            # get_datasets) so an explicit override keeps inputs and targets
            # aligned; previously args.batch_size_singletoken was used here.
            inp = longer_sample[:, :batch_size]
            model_output = model(input_ids=inp)
            target = longer_sample[:, 1:]
            logits = model_output[0]
            log_softmax_probs = F.log_softmax(logits, dim=-1)
            nll = F.nll_loss(log_softmax_probs[0], target[0], reduction='sum')
            true_token_logits = - \
                F.nll_loss(logits[0], target[0], reduction='none')

            if alpha_entmax is False:
                filtered_logits = top_k_top_p_filtering(
                    logits.squeeze(0), top_k=args.top_k, top_p=args.top_p).unsqueeze(0)
                # NOTE: `prev` is never read, but the multinomial call draws
                # from the RNG, so removing it would change the tokens
                # sampled below under a fixed seed.
                prev = F.softmax(
                    filtered_logits.view(filtered_logits.shape[1:]),
                    dim=-1).multinomial(num_samples=1).unsqueeze(0).squeeze(-1)
                probs = F.softmax(filtered_logits, dim=-1)
            else:
                probs = entmax_bisect(logits, torch.tensor(
                    [args.alpha], requires_grad=True, device=torch.device(args.gpu)).float())
            arange = np.arange(logits.size(1))

            # Probability the distribution assigns to each gold token.
            next_token_probs = probs[:, arange, target.squeeze().tolist()]
            voc_sizes = probs.size(-1)
            # Additive smoothing keeps the log finite when the (possibly
            # sparse or truncated) distribution gives the gold token zero mass.
            smoothed_nll = -torch.mean(torch.log(
                (next_token_probs + args.laplas_eps) / (1 + args.laplas_eps * voc_sizes)
            ))

            # Sample one token per position from the evaluation distribution.
            pred = probs.view(-1, probs.size(-1)
                              ).multinomial(num_samples=1).view(probs.shape[:-1])
            predicted_tokens.extend(pred.view(-1).tolist())
            ntokens = inp.numel()

            # One-hot "logits" marking the sampled token, so the ranking
            # metrics are computed for the sample rather than the raw logits.
            rep_logits = torch.zeros_like(logits)
            rep_logits[:, arange, pred.squeeze().tolist()] = 1
            logging_output = TrainingMetrics.ranking_metrics(
                rep_logits[0].float(), true_token_logits, None, ntokens, target[0])
            logging_output['loss'] = nll.item()
            logging_output['smoothed_nll_loss'] = smoothed_nll.item()
            logging_output['normalizer'] = ntokens
            logging_output['sample_size'] = ntokens
            logging_output['ntokens'] = ntokens
            logging_output['js_div'] = jensen_shannon_divergence(
                probs, target).mean().item()
            if args.token_loss == 'alpha_entmax':
                loss = ((probs - F.one_hot(target,
                                           num_classes=probs.size(-1))) * logits).sum(-1)
                loss += alpha_entropy(probs, args.alpha)
                logging_output['alpha_entmax_loss'] = loss.mean().item()
            logging_outputs.append(logging_output)

            # for human uniq
            target_tokens.extend(target.view(-1).tolist())

    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
        logging_outputs)
    logging_average['e_ppl'] = np.exp(
        np.mean([x['smoothed_nll_loss'] for x in logging_outputs]))
    # aggregate_logging_outputs does division by log(2) of loss
    logging_average['ppl'] = 2**logging_average['loss']
    logging_average['human_uniq'] = len(set(target_tokens))
    logging_average['uniq'] = len(set(predicted_tokens))
    logging_average['wrep'] = np.mean(
        [v for k, v in logging_average.items() if k.startswith('wrong_repeat')])
    logging_average['rep'] = np.mean(
        [v for k, v in logging_average.items() if k.startswith('repeat')])
    logging_average['js_div'] = np.mean([x['js_div'] for x in logging_outputs])
    if args.token_loss == 'alpha_entmax':
        logging_average['alpha_entmax_loss'] = np.mean(
            [x['alpha_entmax_loss'] for x in logging_outputs])

    save_singletoken_sampling_metrics(
        logging_average,
        config.to_dict(),
        args,
        top_k=top_k,
        top_p=top_p,
        train_iter=train_iter)

    return logging_average
    def forward(self, model, sample, reduce=True, compute_custom_metrics=True):
        """Compute the MLE loss plus a penalty on previously seen target tokens.

        The returned loss is ``mle_loss + self.rank_alpha * custom_loss``.
        ``mle_loss`` is the summed token NLL. ``custom_loss`` sums
        ``-log(1 - p(token))`` over each position's negative targets, which
        are the target tokens that occurred earlier in the flattened target
        sequence (excluding the current position's own target).

        Args:
            model: Model called with ``**sample['net_input']``; must provide
                ``get_targets`` and ``get_normalized_probs``.
            sample: Dict with ``'net_input'`` and ``'target'``.
            reduce: Accepted but not read in this method.
            compute_custom_metrics: When true, the ranking metrics are merged
                into the logging output.

        Returns:
            Tuple ``(loss, sample_size, logging_output)``.

        Raises:
            NotImplementedError: If ``self.candidate_type`` is not
                ``'prev_context'``.
        """
        net_output = model(**sample['net_input'])
        target = model.get_targets(sample, net_output)
        nsentences = target.size(0)
        # Flatten so every target position becomes one row.
        target = target.view(-1)

        # -- mle loss
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        true_token_lprobs = F.nll_loss(
            lprobs,
            target,
            ignore_index=self.padding_idx,
            reduction='none',
        )
        mle_loss = true_token_lprobs.sum()

        # -- custom loss
        # Maximize (1 - p(x_nt)) for negative target tokens x_nt (equivalently minimize -log(1-p(x_nt)))

        # - form negative targets
        with torch.no_grad():
            # E.g. DABCC | D | EFFGD => {A,B,C} are negative targets.
            if self.candidate_type == 'prev_context':
                # Make 'the triangle'.
                # Row i starts as a copy of the whole flattened target sequence.
                ctx_cands = target.unsqueeze(0).expand(target.size(0), target.size(0))
                # Strictly-lower triangle keeps only targets from earlier
                # positions (j < i); padding_idx is then added everywhere.
                ctx_cands_ = (ctx_cands.tril(-1) + self.padding_idx)
                # triu() zeroes the strictly-lower part, so the product is 0
                # for j < i and padding_idx ** 2 for j >= i.
                # NOTE(review): that equals padding_idx only when padding_idx
                # is 0 or 1 - confirm the dictionary satisfies this.
                ctx_cands_ = ctx_cands_ * ctx_cands_.triu()
                # Row i now holds the earlier targets followed by the filler
                # value in every position from i onwards.
                ctx_cands = ctx_cands.tril(-1) + ctx_cands_

                # Don't include the target for that timestep as a negative target.
                ctx_cands = ctx_cands.masked_fill(ctx_cands == target.unsqueeze(1), self.padding_idx)
                # Mark each candidate's vocabulary column with 1. The filler
                # entries scatter into the padding_idx column, so that column
                # is marked as well.
                negative_targets = torch.zeros_like(lprobs).scatter_(1, ctx_cands, 1)
            else:
                raise NotImplementedError('candidate type %s' % self.candidate_type)

        # - compute loss
        # Clamp keeps the log below finite when p is numerically 1.
        one_minus_probs = torch.clamp((1.0 - lprobs.exp()), min=1e-5)

        custom_loss = -torch.log(one_minus_probs)*negative_targets
        custom_loss = custom_loss.sum()

        loss = mle_loss + self.rank_alpha * custom_loss

        # -- metrics
        logits = net_output[0].view(-1, net_output[0].size(-1))
        # Negated nll_loss on raw logits yields the logit of each gold token.
        true_token_logits = -F.nll_loss(
            logits,
            target,
            ignore_index=self.padding_idx,
            reduction='none',
        )

        # Token count excluding padding.
        orig = utils.strip_pad(target, self.padding_idx)
        ntokens = orig.numel()
        sample_size = sample['target'].size(0) if self.args.sentence_avg else ntokens

        # 'loss' logs only the MLE part; the penalty is logged separately.
        logging_output = {
            'custom_loss': utils.item(custom_loss.data),
            'loss': utils.item(mle_loss.data),
            'ntokens': ntokens,
            'nsentences': nsentences,
            'sample_size': sample_size,
        }
        if compute_custom_metrics:
            custom_output = TrainingMetrics.ranking_metrics(logits, true_token_logits, sample, ntokens, target)
            for k, v in custom_output.items():
                logging_output[k] = v

        return loss, sample_size, logging_output