Code Example #1
 def __init__(self, agent, train_test, env, trunc, sampling):
     Metric.__init__(self, agent, train_test, "cider", "scalar", env, trunc,
                     sampling)
     self.score_function = Cider()
     self.tokenizer = PTBTokenizer()
     self.candidates = []
     self.refs = []
Code Example #2
class CiderMetric(Metric):
    def __init__(self, agent, train_test, env, trunc, sampling):
        Metric.__init__(self, agent, train_test, "cider", "scalar", env, trunc,
                        sampling)
        self.score_function = Cider()
        self.tokenizer = PTBTokenizer()
        self.candidates = []
        self.refs = []

    def fill_(self, **kwargs):
        pass

    def compute_(self, **kwargs):
        question_decoded = self.dataset.question_tokenizer.decode(
            kwargs["state"].text.numpy()[0],
            ignored=["<SOS>"],
            stop_at_end=True)
        ref_questions = kwargs["ref_questions_decoded"][0]
        self.candidates.append(question_decoded)
        self.refs.append([ref_questions])

    def post_treatment_(self):
        # _strip is assumed to be a module-level helper equivalent to str.strip
        refs = {
            idx: list(map(_strip, ref))
            for (idx, ref) in enumerate(self.refs)
        }
        hyps = {
            idx: [lines.strip()]
            for (idx, lines) in enumerate(self.candidates)
        }
        score, scores = self.score_function.compute_score(refs, hyps)
        self.metric_history.extend(scores)
Code Example #3
File: __init__.py  Project: justdenz/nlg-eval
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
    assert isinstance(hyp, six.string_types)

    if isinstance(ref, six.string_types):
        ref = ref.split('||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
        cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
        cosine_similarity = np.max(cosine_similarity, axis=0).mean()
        ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    return ret_scores
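Usage note: the overlap branch above can be exercised directly with one hypothesis string and a list of references. The sketch below is a minimal, hypothetical call, assuming nlg-eval (and its METEOR Java dependency) is installed and that compute_individual_metrics is imported from the nlgeval package; the skip-thought and GloVe branches are switched off.

# Minimal usage sketch (hypothetical inputs; assumes nlg-eval is installed).
from nlgeval import compute_individual_metrics

references = [
    "a man is playing a guitar on stage",
    "someone plays guitar in front of a crowd",
]
hypothesis = "a man plays a guitar on stage"

# Only the n-gram overlap metrics (BLEU 1-4, METEOR, ROUGE_L, CIDEr) are computed.
scores = compute_individual_metrics(references, hypothesis,
                                     no_skipthoughts=True,
                                     no_glove=True)
print(scores["Bleu_4"], scores["CIDEr"])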
Code Example #4
def compute_individual_metrics(ref,
                               hyp,
                               no_overlap=False,
                               no_skipthoughts=True,
                               no_glove=False):
    assert isinstance(hyp, six.string_types)

    if isinstance(ref, six.string_types):
        ref = ref.split(
            '||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score

    return ret_scores
Code Example #5
def compute_metrics(hypothesis,
                    references,
                    no_overlap=False,
                    no_skipthoughts=True,
                    no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
        del scorers

    return ret_scores
Code Example #6
 def load_scorers(self):
     self.scorers = [
         (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
         (Meteor(), "METEOR"),
         (Rouge(), "ROUGE_L"),
         (Cider(), "CIDEr")
     ]
Code Example #7
    def forward(self,
                images,
                features,
                tag_ids,
                input_ids,
                target,
                all_caps,
                beam_size,
                self_crit_seq_train=None):
        if self_crit_seq_train is None:
            preds_out = self.model(images, features, input_ids, tag_ids)
            loss = self.loss_fn(
                preds_out.reshape(-1, hyper_parameters['vocab_dim']),
                target.reshape(-1))
            return loss, preds_out

        else:  # self_crit_seq_train
            ref_list, hyp_list, scores = self.__generate__(
                images, features, tag_ids, all_caps, beam_size)

            refs = {idx: lines for (idx, lines) in enumerate(ref_list)}
            hyps = {idx: [lines] for (idx, lines) in enumerate(hyp_list)}
            _, reward = Cider().compute_score(refs, hyps)  # (N, beam_size)

            reward = torch.from_numpy(reward).to(device).view(scores.shape)
            reward_baseline = torch.mean(reward, dim=1, keepdim=True)

            loss = -scores * (reward - reward_baseline)
            loss = loss.mean()

            return loss, hyp_list[::beam_size]
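The self-critical branch above weights each candidate's sequence score by its CIDEr reward minus the per-image mean reward over the beam. A tiny standalone sketch of that arithmetic, with made-up reward values and no model involved, looks like this:

import torch

# Made-up CIDEr rewards for N=2 images, beam_size=3 candidates each (illustrative only).
reward = torch.tensor([[0.8, 0.5, 0.2],
                       [0.4, 0.4, 0.1]])
scores = torch.randn(2, 3, requires_grad=True)      # stand-in for sequence log-probabilities

reward_baseline = reward.mean(dim=1, keepdim=True)  # per-image mean reward as baseline
loss = (-scores * (reward - reward_baseline)).mean()
loss.backward()                                     # gradients flow only through `scores`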
Code Example #8
def evaluate_narrative_qa(ground_truth, predicted_answers):
  """Evaluation NarrativeQA predictions."""
  scorers = [(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
             (Rouge(), 'ROUGE_L'), (Cider(), 'CIDEr')]

  def preprocess(text):
    return text.lower().rstrip(' .').strip()

  common_keys = [k for k in predicted_answers if k in ground_truth]
  refs = {k: [preprocess(s) for s in ground_truth[k]] for k in common_keys}
  hyps = {k: [preprocess(predicted_answers[k])] for k in common_keys}

  ret_scores = dict(common=len(common_keys))
  for scorer, method in scorers:
    score, scores = scorer.compute_score(refs, hyps)
    if isinstance(method, list):
      for sc, _, m in zip(score, scores, method):
        # print('%s: %0.6f' % (m, sc))
        ret_scores[m] = sc * 100
    else:
      # print('%s: %0.6f' % (method, score))
      ret_scores[method] = score * 100
    if isinstance(scorer, Meteor):
      scorer.close()
  del scorers
  return ret_scores
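Both dictionaries passed to evaluate_narrative_qa are keyed by question id, and only the intersection of keys is scored. A quick hypothetical call (the ids and answers below are invented for illustration):

# Hypothetical inputs: lists of reference answers vs. single predicted strings.
ground_truth = {
    "q1": ["He went to the market.", "He walked to the market."],
    "q2": ["She found the key under the mat."],
}
predicted_answers = {
    "q1": "he went to the market",
    "q2": "she found the key",
}

scores = evaluate_narrative_qa(ground_truth, predicted_answers)
print(scores["common"], scores["ROUGE_L"], scores["CIDEr"])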
Code Example #9
    def load_scorers(self):
        self.scorers = []

        omit_bleu_i = False
        for i in range(1, 4 + 1):
            if 'Bleu_{}'.format(i) in self.metrics_to_omit:
                omit_bleu_i = True
                if i > 1:
                    self.scorers.append((Bleu(i - 1), ['Bleu_{}'.format(j) for j in range(1, i)]))
                break
        if not omit_bleu_i:
            self.scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))

        if 'ROUGE_L' not in self.metrics_to_omit:
            self.scorers.append((Rouge(), "ROUGE_L"))
        if 'CIDEr' not in self.metrics_to_omit:
            self.scorers.append((Cider(), "CIDEr"))
Code Example #10
def compute_metrics(ref, hyp):
    # ref = ref.split('||<|>||')  # special delimiter
    #ref = [a.strip() for a in ref]
    refs = {0: [ref]}
    #ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                ret_scores[m] = sc
        else:
            ret_scores[method] = score
    return ret_scores
Code Example #11
def compute_metrics(gt_caps, pred_caps):
    assert len(gt_caps) == len(pred_caps)
    gt_caps = add_space_to_cap_dict(gt_caps)
    pred_caps = add_space_to_cap_dict(pred_caps)

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gt_caps, pred_caps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
        if isinstance(scorer, Meteor):
            scorer.close()
    del scorers
    return ret_scores
Code Example #12
def compute_metrics_all(references, hypothesises):
    refs = {
        idx: [strippedlines.strip()]
        for (idx, strippedlines) in enumerate(references)
    }
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesises)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                #print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            #print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
    return ret_scores
Code Example #13
File: metric.py  Project: zxr445116086/QG
def compute_metrics_by_file(references, hypothesis):
    """
    Given a list of gold file names and a predict result file,
    calculate metrics. Same line number corresponds to the same
    instance to calculate metric.
    Ref: https://github.com/Maluuba/nlg-eval
    :param references: list of gold file names.
    :param hypothesis: predict file name.
    :return: a dict mapping metric names to scores.
    """
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]

    def _strip(s):
        return s.strip()

    with open(hypothesis, encoding='utf-8') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, encoding='utf-8') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}

    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                # print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            # print("%s: %0.6f" % (method, score))
            ret_scores[method] = score

    return ret_scores
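Since compute_metrics_by_file aligns the files line by line, a quick smoke test can be run against throwaway files. The file names and contents below are purely illustrative; METEOR additionally requires the bundled Java jar to be available.

import os
import tempfile

# Illustrative data: two reference files and one hypothesis file, one instance per line.
tmp_dir = tempfile.mkdtemp()
contents = {
    "ref_a.txt": ["what is the capital of france ?", "who wrote hamlet ?"],
    "ref_b.txt": ["which city is the capital of france ?", "who is the author of hamlet ?"],
    "hyp.txt":   ["what is the capital of france ?", "who wrote the play hamlet ?"],
}
paths = {}
for name, lines in contents.items():
    paths[name] = os.path.join(tmp_dir, name)
    with open(paths[name], "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")

scores = compute_metrics_by_file([paths["ref_a.txt"], paths["ref_b.txt"]], paths["hyp.txt"])
print(scores["Bleu_4"], scores["CIDEr"])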
Code Example #14
File: __init__.py  Project: weiwangthu/nlg-eval
def compute_metrics(hypothesis,
                    references,
                    no_overlap=False,
                    no_skipthoughts=False,
                    no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list],
                                     verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(
            lambda refl: encoder.encode([r.strip() for r in refl],
                                        verbose=False), ref_list_T)
        cosine_similarity = list(
            map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(),
                vector_refs))
        cosine_similarity = np.max(cosine_similarity, axis=0).mean()
        print("SkipThoughtsCosineSimilarity: %0.6f" % (cosine_similarity))
        ret_scores['SkipThoughtCS'] = cosine_similarity
        del model

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        print(scores)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    return ret_scores
Code Example #15
def train(train_loader, decoder, decoder_optimizer, epoch, rev_word_map):
    """
    Performs one epoch's training.
    :param train_loader: DataLoader for training data
    :param decoder: decoder model
    :param criterion_ce: cross entropy loss layer
    :param criterion_dis : discriminative loss layer
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """

    decoder.train()  # train mode (dropout and batchnorm is used)

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()

    # Batches
    for i, (imgs, caps, caplens, allcaps) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        scores, scores1, caps_sorted, decode_lengths, sort_ind = decoder(imgs, caps, caplens)

        # /!\ scores shape: (batch_size, max_captions_real_length,vocab_size)
        # scores[0, t, :]= proba(y[t]|y[1:t-1])

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]  # (batch_size, max_caption_real_length)

        scores_copy = scores.clone()
        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
        targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

        # Calculate cross entropy
        # crit = criterion_xe(scores, targets)

        # References
        references = list()
        allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
        for j in range(allcaps.shape[0]):
            img_caps = allcaps[j].tolist()
            img_captions = list(map(lambda c: [rev_word_map[w] for w in c if
                                               w not in {word_map['<start>'], word_map['<pad>']}],
                                    img_caps))  # remove <start> and pads
            ref_caps = [' '.join(c) for c in img_captions]
            references.append(ref_caps)
        # print(references[-1])
        # Hypotheses
        hypotheses = list()
        _, preds = torch.max(scores_copy, dim=2)
        preds = preds.tolist()
        temp_preds = list()
        for j, p in enumerate(preds):
            temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
        preds = temp_preds
        # print(preds[0])
        preds_caption = list(map(lambda c: [rev_word_map[w] for w in c if
                                            w not in {word_map['<start>'], word_map['<pad>']}],
                                 preds))
        preds_caption = [' '.join(c) for c in preds_caption]

        hypotheses.extend(preds_caption)

        assert len(references) == len(hypotheses)

        # Sample decoding
        samples = list()
        proba = softmax(scores_copy, dim=2)
        B, T, V = proba.size()
        sampled = np.zeros((B, T), dtype=np.int32)
        sampled_entropy = torch.zeros([B, T]).to(device)
        for b in range(B):
            for t in range(decode_lengths[b]):
                sampled[b][t] = torch.multinomial(proba[b][t].view(-1), 1).item()
                sampled_entropy[b][t] = torch.log(proba[b][t][sampled[b][t]])
        temp_sampled = list()
        for j, p in enumerate(sampled):
            temp_sampled.append(sampled[j][:decode_lengths[j]])  # remove pads

        log_proba = torch.sum(sampled_entropy, dim=1)

        sampled_caption = list(
            map(lambda c: [rev_word_map[w] for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
                temp_sampled))
        sampled_caption = [' '.join(c) for c in sampled_caption]

        samples.extend(sampled_caption)

        # print(samples)

        # Calculate loss
        cider = Cider()
        cider_ = Cider()

        baseline = torch.Tensor(compute_metric(cider_, references, hypotheses)).to(device)
        reward = torch.Tensor(compute_metric(cider, references, samples)).to(device)

        # print(log_proba.requires_grad)
        # loss = -(compute_metric(cider, references,samples) - compute_metric(cider_,references, hypotheses)) * crit
        loss = -torch.sum((reward-baseline) * log_proba)

        # Back prop.
        decoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients when they are getting too large
        torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, decoder.parameters()), 0.25)

        # Update weights
        decoder_optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.6f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                          batch_time=batch_time,
                                                                          data_time=data_time, loss=losses,
                                                                          top5=top5accs))

            print('Reward : ', torch.mean(reward).item())
            print('Baseline : ', torch.mean(baseline).item())
Code Example #16
def validate(val_loader, decoder, rev_word_map):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data
    :param decoder: decoder model
    :param rev_word_map: mapping from word indices back to words
    :return: average CIDEr reward on the validation set
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references_ = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses_ = list()  # hypotheses (predictions)

    # Batches
    with torch.no_grad():
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # Move to GPU, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            scores, scores1, caps_sorted, decode_lengths, sort_ind = decoder(imgs, caps, caplens)

            # /!\ scores shape: (batch_size, max_captions_real_length,vocab_size)
            # scores[0, t, :]= proba(y[t]|y[1:t-1])

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]  # (batch_size, max_caption_real_length)

            scores_copy = scores.clone()
            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
            targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

            # Calculate cross entropy
            # crit = criterion_xe(scores, targets)

            # References
            references = list()
            allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(map(lambda c: [rev_word_map[w] for w in c if
                                                   w not in {word_map['<start>'],
                                                             word_map['<pad>']}],
                                        img_caps))  # remove <start> and pads
                ref_caps = [' '.join(c) for c in img_captions]
                references.append(ref_caps)

            # Hypotheses
            hypotheses = list()
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            # print(preds[0])
            preds_caption = list(map(lambda c: [rev_word_map[w] for w in c if
                                                w not in {word_map['<start>'], word_map['<pad>']}],
                                     preds))
            preds_caption = [' '.join(c) for c in preds_caption]

            hypotheses.extend(preds_caption)

            assert len(references) == len(hypotheses)

            # Sample decoding
            samples = list()
            proba = softmax(scores_copy, dim=2)
            B, T, V = proba.size()
            sampled = np.zeros((B, T), dtype=np.int32)
            sampled_entropy = torch.zeros([B, T]).to(device)
            for b in range(B):
                for t in range(decode_lengths[b]):
                    sampled[b][t] = torch.multinomial(proba[b][t].view(-1), 1).item()
                    sampled_entropy[b][t] = torch.log(proba[b][t][sampled[b][t]])
            temp_sampled = list()
            for j, p in enumerate(sampled):
                temp_sampled.append(sampled[j][:decode_lengths[j]])  # remove pads

            log_proba = torch.sum(sampled_entropy, dim=1)

            sampled_caption = list(
                map(lambda c: [rev_word_map[w] for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
                    temp_sampled))
            sampled_caption = [' '.join(c) for c in sampled_caption]

            samples.extend(sampled_caption)

            # print(samples)

            # Calculate loss
            cider = Cider()
            cider_ = Cider()

            baseline = torch.Tensor(compute_metric(cider_, references, hypotheses)).to(device)
            reward = torch.Tensor(compute_metric(cider, references, samples)).to(device)

            # print(log_proba.requires_grad)
            # loss = -(compute_metric(cider, references,samples) - compute_metric(cider_,references, hypotheses)) * crit
            loss = -torch.sum((reward-baseline) * log_proba)
            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                batch_time=batch_time,
                                                                                loss=losses, top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
            references_.extend(references)
            hypotheses_.extend(hypotheses)
         
    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references_, hypotheses_)
    bleu4 = round(bleu4, 4)

    # Calculate the average CIDEr reward over the validation set
    avg_cider = Cider()
    avg_reward = np.mean(compute_metric(avg_cider, references_, hypotheses_))
    print('val reward', avg_reward)
    print(
        '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu} , CIDEr - {cidr}\n'.format(
            loss=losses,
            top5=top5accs,
            bleu=bleu4,
            cidr=avg_reward))

    return avg_reward
Code Example #17
File: __init__.py  Project: vipulbjj/code.fun.do
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]  # list() so refs stay reusable on Python 3
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    ret1_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    # BLEU_1 .. BLEU_4
                    ret1_scores[m] = sc
            else:
                # METEOR, ROUGE_L and CIDEr
                ret_scores[method] = score

    # if not no_skipthoughts:
    #     from nlgeval.skipthoughts import skipthoughts
    #     import numpy as np
    #     from sklearn.metrics.pairwise import cosine_similarity

    #     model = skipthoughts.load_model()
    #     encoder = skipthoughts.Encoder(model)
    #     vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
    #     ref_list_T = np.array(ref_list).T.tolist()
    #     vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
    #     cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
    #     cosine_similarity = np.max(cosine_similarity, axis=0).mean()
    #     print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
    #     ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        #print(scores)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    # Weight each metric; a, b, c, d, e and maximum_marks are assumed to be
    # constants defined elsewhere in this project (key spelling kept as
    # produced upstream by nlg-eval).
    ret_scores["METEOR"] = ret_scores["METEOR"] * a
    ret_scores["ROUGE_L"] = ret_scores["ROUGE_L"] * b
    ret_scores["CIDEr"] = ret_scores["CIDEr"] * c
    ret_scores["EmbeddingAverageCosineSimilairty"] = ret_scores["EmbeddingAverageCosineSimilairty"] * d
    ret_scores["VectorExtremaCosineSimilarity"] = ret_scores["VectorExtremaCosineSimilarity"] * e
    # ret_scores["GreedyMatchingScore"] = ret_scores["GreedyMatchingScore"] * f

    total = (ret_scores["METEOR"] + ret_scores["ROUGE_L"] + ret_scores["CIDEr"]
             + ret_scores["EmbeddingAverageCosineSimilairty"]
             + ret_scores["VectorExtremaCosineSimilarity"])

    marks = total * maximum_marks
    print("Marks: %0.2f" % marks)