Example No. 1
    def compute_score(self, gts, res):
        assert (list(gts) == list(res))
        imgIds = list(gts)

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert (type(hypo) is list)
            assert (len(hypo) == 1)
            assert (type(ref) is list)
            assert (len(ref) >= 1)

            # Convert to UTF-8 if necessary
            if sys.version_info.major == 2:
                for j in range(len(hypo)):
                    if type(hypo[j]) == str:
                        hypo[j] = hypo[j].decode('utf-8')
                for j in range(len(ref)):
                    if type(ref[j]) == str:
                        ref[j] = ref[j].decode('utf-8')

            bleu_scorer += (hypo[0], ref)

        # score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        # score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores
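A minimal usage sketch for the method above, assuming it belongs to the pycocoevalcap-style Bleu wrapper (the import path and toy data below are illustrative): gts and res both map an image id to a list of sentence strings, and res carries exactly one hypothesis per id.

from pycocoevalcap.bleu.bleu import Bleu  # assumed install/layout

gts = {0: ['a dog runs in the park', 'a dog is running outside'],
       1: ['a cat sits on a mat']}
res = {0: ['a dog runs through a park'],
       1: ['a cat sits on the mat']}

score, scores = Bleu(4).compute_score(gts, res)
print(score)   # corpus-level BLEU-1..4
print(scores)  # per-image BLEU-1..4 lists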
Example No. 2
 def get_scores(self, preds, target):
     if self.bleu_scorer == 'coco':
         bleu_scorer = BleuScorer(n=self.bleu_order)
         coco = True
     else:
         coco = False
         scores = []
     # Go to sentence space to compute scores:
     hypo = decode_sequence(self.vocab, preds)  # candidate
     refs = decode_sequence(self.vocab, target.data)  # references
     num_img = target.size(0) // self.seq_per_img
     for e, h in enumerate(hypo):
         ix_start = e // self.seq_per_img * self.seq_per_img
         ix_end = ix_start + self.seq_per_img
         if coco:
             bleu_scorer += (h, refs[ix_start:ix_end])
         else:
             scores.append(
                 sentence_bleu(h,
                               ' '.join(refs[ix_start:ix_end]),
                               order=self.bleu_order))
     if coco:
         (score, scores) = bleu_scorer.compute_score()
         scores = scores[-1]
     self.logger.debug("Bleu scores: %s" % str(scores))
     return scores
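A toy sketch of the indexing above (numbers are made up): hypotheses share the targets' layout, so hypothesis e is scored against the block of seq_per_img references describing the same image.

seq_per_img = 5
num_hypos = 10  # e.g. 2 images x 5 sampled captions
for e in range(num_hypos):
    ix_start = e // seq_per_img * seq_per_img
    ix_end = ix_start + seq_per_img
    print('hypothesis %d -> refs[%d:%d]' % (e, ix_start, ix_end))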
Example No. 3
    def compute_score(self, refs, hypos):
        '''

        :param refs: instance_num x refer_num x str
        :param hypos: instance_num x 1 x str
        :return:
        '''

        bleu_scorer = BleuScorer(n=self._n)
        for ref, hypo in zip(refs, hypos):

            # Sanity check.
            assert (type(hypo) is list)
            assert (len(hypo) == 1)
            assert (type(ref) is list)
            assert (len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores
Example No. 4
def generate_refs(sentences, order_method, align_method, minus, sentid, simi_mat=None):
    tokens = [s['tokens'] for s in sentences]
    if align_method == 'soft':
        hiddens = [s['hidden'] for s in sentences]
    else:
        hiddens = None

    refs = generate_lattice(tokens, hiddens, order_method, align_method, simi_mat=simi_mat, minus=minus)

    for e in tokens:
        refs.add(' '.join(e))
    refs = list(refs)
    bleu_scorer = BleuScorer(n=4)
    for ref in refs:
        bleu_scorer += (ref, [' '.join(e) for e in tokens])
    score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
    new_sentences = []
    for i, s in enumerate(scores[3]):
        new_ref = {}
        new_ref['imgid'] = sentences[0]['imgid']
        new_ref['raw'] = refs[i]
        new_ref['tokens'] = refs[i].split(' ')
        new_ref['sentid'] = sentid
        new_ref['bleu'] = s
        new_sentences.append(new_ref)
        sentid += 1

    return new_sentences
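A small sketch of how the returned values are laid out, assuming the pycocoevalcap BleuScorer: score holds one corpus-level value per n-gram order, while scores holds one list per order with an entry for each accumulated (hypothesis, references) pair, which is why scores[3] above gives per-candidate BLEU-4.

from pycocoevalcap.bleu.bleu_scorer import BleuScorer  # assumed layout

candidates = ['a dog runs', 'a cat sits']
references = ['a dog runs in the park', 'a cat sits on a mat']

scorer = BleuScorer(n=4)
for cand in candidates:
    scorer += (cand, references)
score, scores = scorer.compute_score(option='closest', verbose=0)
print(len(score))      # 4 -> BLEU-1..4
print(len(scores[3]))  # 2 -> one BLEU-4 value per candidate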
Example No. 5
    def compute_score(self, gts, res):

        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            #print(ref)
            #assert(len(ref) > 1)

            bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores
Example No. 6
 def compute_score(self, gts, res):
     
     assert(sorted(gts.keys()) == sorted(res.keys()))
     #imgIds = sorted(gts.keys())
     
     bleu_scorer = BleuScorer(n=self._n)
     for id in gts:
         hypo = res[id]
         ref = gts[id]
         
         # Sanity check.
         assert(type(hypo) is list)
         assert(len(hypo) == 1)
         assert(type(ref) is list)
         assert(len(ref) >= 1)
         
         bleu_scorer += (hypo[0], ref)
     
     # Reduce verbosity
     score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
     
     # return (bleu, bleu_info)
     return score, scores
Example No. 7
    def forward(
            self,  # type: ignore
            context: Dict[str, torch.LongTensor],
            image: torch.Tensor,
            caption: Dict[str, torch.LongTensor],
            face_embeds: torch.Tensor,
            obj_embeds: torch.Tensor,
            metadata: List[Dict[str, Any]],
            names: Dict[str, torch.LongTensor] = None,
            attn_idx=None) -> Dict[str, torch.Tensor]:

        caption_ids, target_ids, contexts = self._forward(
            context, image, caption, face_embeds, obj_embeds)
        decoder_out = self.decoder(caption, contexts)

        # Assume we're using adaptive loss
        loss, sample_size = self.criterion(self.decoder.adaptive_softmax,
                                           decoder_out, target_ids)

        loss = loss / math.log(2)

        output_dict = {
            'loss': loss / sample_size,
            'sample_size': sample_size,
        }

        # During evaluation, we will generate a caption and compute BLEU, etc.
        if not self.training and self.evaluate_mode:
            _, gen_ids, attns = self._generate(caption_ids, contexts, attn_idx)
            # We ignore <s> and <pad>
            gen_texts = [self.roberta.decode(x[x > 1]) for x in gen_ids.cpu()]
            captions = [m['caption'] for m in metadata]

            output_dict['captions'] = captions
            output_dict['generations'] = gen_texts
            output_dict['metadata'] = metadata
            output_dict['attns'] = attns
            output_dict['gen_ids'] = gen_ids.cpu().detach().numpy()

            # Remove punctuation
            gen_texts = [re.sub(r'[^\w\s]', '', t) for t in gen_texts]
            captions = [re.sub(r'[^\w\s]', '', t) for t in captions]

            for gen, ref in zip(gen_texts, captions):
                bleu_scorer = BleuScorer(n=4)
                bleu_scorer += (gen, [ref])
                score, _ = bleu_scorer.compute_score(option='closest')
                self.sample_history['bleu-1'] += score[0] * 100
                self.sample_history['bleu-2'] += score[1] * 100
                self.sample_history['bleu-3'] += score[2] * 100
                self.sample_history['bleu-4'] += score[3] * 100

                # rogue_scorer = Rouge()
                # score = rogue_scorer.calc_score([gen], [ref])
                # self.sample_history['rogue'] += score * 100

            if 'rare_tokens' in caption:
                for gen, ref, rare_list in zip(gen_texts, captions,
                                               caption['rare_tokens']):
                    bleu_scorer = BleuScorer(n=4)
                    rare_words = ' '.join(rare_list)
                    gen = gen + ' ' + rare_words

                    if rare_words:
                        print(ref)
                        print(gen)
                        print()

                    bleu_scorer += (gen, [ref])
                    score, _ = bleu_scorer.compute_score(option='closest')
                    self.sample_history['bleu-1r'] += score[0] * 100

        self.n_samples += caption_ids.shape[0]
        self.n_batches += 1

        return output_dict
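A hedged sketch contrasting the pattern above (a fresh BleuScorer per generation/reference pair, i.e. sentence-level BLEU averaged over samples) with a single accumulating scorer that yields corpus-level BLEU; the data and names below are illustrative.

from pycocoevalcap.bleu.bleu_scorer import BleuScorer  # assumed layout

pairs = [('a dog runs', ['a dog runs in the park']),
         ('a cat sits', ['a cat sits on a mat'])]

# Sentence-level, as in forward(): one scorer per pair, then average.
sent_bleu4 = []
for gen, refs in pairs:
    scorer = BleuScorer(n=4)
    scorer += (gen, refs)
    score, _ = scorer.compute_score(option='closest')
    sent_bleu4.append(score[3] * 100)

# Corpus-level alternative: one scorer accumulating every pair.
corpus_scorer = BleuScorer(n=4)
for gen, refs in pairs:
    corpus_scorer += (gen, refs)
corpus_score, _ = corpus_scorer.compute_score(option='closest')

print(sum(sent_bleu4) / len(sent_bleu4), corpus_score[3] * 100)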
Example No. 8
    def forward(self,  # type: ignore
                context: Dict[str, torch.LongTensor],
                image: torch.Tensor,
                caption: Dict[str, torch.LongTensor],
                face_embeds: torch.Tensor,
                metadata: List[Dict[str, Any]],
                names=None) -> Dict[str, torch.Tensor]:

        caption_ids, target_ids, contexts, X_sections_hiddens, article_padding_mask = self._forward(
            context, image, caption, face_embeds)
        decoder_out = self.decoder(caption, contexts)

        # Assume we're using adaptive loss
        gen_loss, sample_size = self.criterion(
            self.decoder.adaptive_softmax, decoder_out, target_ids)

        entity_loss, copy_loss = self.pointer_loss(
            decoder_out, context, caption, target_ids, X_sections_hiddens, article_padding_mask)

        gen_loss = gen_loss / sample_size / math.log(2)
        entity_loss = entity_loss / math.log(2)
        copy_loss = copy_loss / math.log(2)

        loss = entity_loss + copy_loss

        if (self.training and not loss.requires_grad) or torch.isnan(loss):
            loss = None

        if not torch.isnan(gen_loss):
            self.batch_history['gen_loss'] += gen_loss.item()
        if not torch.isnan(entity_loss):
            self.batch_history['entity_loss'] += entity_loss.item()
        if not torch.isnan(copy_loss):
            self.batch_history['copy_loss'] += copy_loss.item()

        output_dict = {
            'loss': loss,
            'sample_size': sample_size,
        }

        # During evaluation, we will generate a caption and compute BLEU, etc.
        if not self.training and self.evaluate_mode:
            log_probs, copy_probs, should_copy_mask, gen_ids = self._generate(
                caption_ids, contexts, X_sections_hiddens, article_padding_mask, context)
            gen_texts = [self.roberta.decode(x[x > 1]) for x in gen_ids.cpu()]
            captions = [m['caption'] for m in metadata]

            copied_texts = [self.roberta.decode(x[should_copy_mask[i]])
                            for i, x in enumerate(gen_ids.cpu())]

            output_dict['captions'] = captions
            output_dict['generations'] = gen_texts
            output_dict['metadata'] = metadata
            output_dict['copied_texts'] = copied_texts

            # Remove punctuation
            gen_texts = [re.sub(r'[^\w\s]', '', t) for t in gen_texts]
            captions = [re.sub(r'[^\w\s]', '', t) for t in captions]

            for gen, ref in zip(gen_texts, captions):
                bleu_scorer = BleuScorer(n=4)
                bleu_scorer += (gen, [ref])
                score, _ = bleu_scorer.compute_score(option='closest')
                self.sample_history['bleu-1'] += score[0] * 100
                self.sample_history['bleu-2'] += score[1] * 100
                self.sample_history['bleu-3'] += score[2] * 100
                self.sample_history['bleu-4'] += score[3] * 100

                # rogue_scorer = Rouge()
                # score = rogue_scorer.calc_score([gen], [ref])
                # self.sample_history['rogue'] += score * 100

        self.n_samples += caption_ids.shape[0]
        self.n_batches += 1

        return output_dict
Example No. 9
def main():
    args = docopt(__doc__, version='0.0.1')
    args = validate(args)

    if args['ptvsd']:
        address = ('0.0.0.0', args['ptvsd'])
        ptvsd.enable_attach(address)
        ptvsd.wait_for_attach()

    with open(args['counters'], 'rb') as f:
        counters = pickle.load(f)

    full_counter = counters['context'] + counters['caption']

    bleu_scorer = BleuScorer(n=4)
    rouge_scorer = Rouge()
    rouge_scores = []
    cider_scorer = CiderScorer(n=4, sigma=6.0)
    meteor_scorer = Meteor()
    meteor_scorer._stat = types.MethodType(_stat, meteor_scorer)
    meteor_scores = []
    eval_line = 'EVAL'
    meteor_scorer.lock.acquire()
    count = 0
    recalls, precisions = [], []
    rare_recall, rare_recall_total = 0, 0
    rare_precision, rare_precision_total = 0, 0
    full_recall, full_recall_total = 0, 0
    full_precision, full_precision_total = 0, 0
    full_rare_recall, full_rare_recall_total = 0, 0
    full_rare_precision, full_rare_precision_total = 0, 0
    lengths, gt_lengths = [], []
    n_uniques, gt_n_uniques = [], []

    gen_ttrs, cap_ttrs = [], []
    gen_flesch, cap_flesch = [], []

    ent_counter = defaultdict(int)

    with open(args['file']) as f:
        for line in tqdm(f):
            obj = json.loads(line)
            if args['use_processed']:
                caption = obj['caption']
                obj['caption_names'] = obj['processed_caption_names']
            else:
                caption = obj['raw_caption']

            generation = obj['generation']

            if obj['caption_names']:
                recalls.append(compute_recall(obj))
            if obj['generated_names']:
                precisions.append(compute_precision(obj))

            c, t = compute_full_recall(obj)
            full_recall += c
            full_recall_total += t

            c, t = compute_full_precision(obj)
            full_precision += c
            full_precision_total += t

            c, t = compute_rare_recall(obj, counters['caption'])
            rare_recall += c
            rare_recall_total += t

            c, t = compute_rare_precision(obj, counters['caption'])
            rare_precision += c
            rare_precision_total += t

            c, t = compute_rare_recall(obj, full_counter)
            full_rare_recall += c
            full_rare_recall_total += t

            c, t = compute_rare_precision(obj, full_counter)
            full_rare_precision += c
            full_rare_precision_total += t

            # Remove punctuation
            caption = re.sub(r'[^\w\s]', '', caption)
            generation = re.sub(r'[^\w\s]', '', generation)

            lengths.append(len(generation.split()))
            gt_lengths.append(len(caption.split()))

            n_uniques.append(len(set(generation.split())))
            gt_n_uniques.append(len(set(caption.split())))

            bleu_scorer += (generation, [caption])
            rouge_score = rouge_scorer.calc_score([generation], [caption])
            rouge_scores.append(rouge_score)
            cider_scorer += (generation, [caption])

            stat = meteor_scorer._stat(generation, [caption])
            eval_line += ' ||| {}'.format(stat)
            count += 1

            gen_ttrs.append(obj['gen_np']['basic_ttr'])
            cap_ttrs.append(obj['caption_np']['basic_ttr'])
            gen_flesch.append(obj['gen_readability']['flesch_reading_ease'])
            cap_flesch.append(
                obj['caption_readability']['flesch_reading_ease'])

            compute_entities(obj, ent_counter)

    meteor_scorer.meteor_p.stdin.write('{}\n'.format(eval_line).encode())
    meteor_scorer.meteor_p.stdin.flush()
    for _ in range(count):
        meteor_scores.append(
            float(meteor_scorer.meteor_p.stdout.readline().strip()))
    meteor_score = float(meteor_scorer.meteor_p.stdout.readline().strip())
    meteor_scorer.lock.release()

    bleu_score, _ = bleu_scorer.compute_score(option='closest')
    rouge_score = np.mean(np.array(rouge_scores))
    cider_score, _ = cider_scorer.compute_score()

    final_metrics = {
        'BLEU-1': bleu_score[0],
        'BLEU-2': bleu_score[1],
        'BLEU-3': bleu_score[2],
        'BLEU-4': bleu_score[3],
        'ROUGE': rouge_score,
        'METEOR': meteor_score,
        'CIDEr': cider_score,
        'All names - recall': {
            'count':
            full_recall,
            'total':
            full_recall_total,
            'percentage':
            (full_recall / full_recall_total) if full_recall_total else None,
        },
        'All names - precision': {
            'count':
            full_precision,
            'total':
            full_precision_total,
            'percentage':
            (full_precision /
             full_precision_total) if full_precision_total else None,
        },
        'Caption rare names - recall': {
            'count':
            rare_recall,
            'total':
            rare_recall_total,
            'percentage':
            (rare_recall / rare_recall_total) if rare_recall_total else None,
        },
        'Caption rare names - precision': {
            'count':
            rare_precision,
            'total':
            rare_precision_total,
            'percentage':
            (rare_precision /
             rare_precision_total) if rare_precision_total else None,
        },
        'Article rare names - recall': {
            'count':
            full_rare_recall,
            'total':
            full_rare_recall_total,
            'percentage':
            (full_rare_recall /
             full_rare_recall_total) if full_rare_recall_total else None,
        },
        'Article rare names - precision': {
            'count':
            full_rare_precision,
            'total':
            full_rare_precision_total,
            'percentage':
            (full_rare_precision /
             full_rare_precision_total) if full_rare_precision_total else None,
        },
        'Length - generation': sum(lengths) / len(lengths),
        'Length - reference': sum(gt_lengths) / len(gt_lengths),
        'Unique words - generation': sum(n_uniques) / len(n_uniques),
        'Unique words - reference': sum(gt_n_uniques) / len(gt_n_uniques),
        'Caption TTR': sum(cap_ttrs) / len(cap_ttrs),
        'Generation TTR': sum(gen_ttrs) / len(gen_ttrs),
        'Caption Flesch Reading Ease': sum(cap_flesch) / len(cap_flesch),
        'Generation Flesch Reading Ease': sum(gen_flesch) / len(gen_flesch),
        'Entity all - recall': {
            'count':
            ent_counter['n_caption_ent_matches'],
            'total':
            ent_counter['n_caption_ents'],
            'percentage':
            ent_counter['n_caption_ent_matches'] /
            ent_counter['n_caption_ents'],
        },
        'Entity all - precision': {
            'count':
            ent_counter['n_gen_ent_matches'],
            'total':
            ent_counter['n_gen_ents'],
            'percentage':
            ent_counter['n_gen_ent_matches'] / ent_counter['n_gen_ents'],
        },
        'Entity person - recall': {
            'count':
            ent_counter['n_caption_person_matches'],
            'total':
            ent_counter['n_caption_persons'],
            'percentage':
            ent_counter['n_caption_person_matches'] /
            ent_counter['n_caption_persons'],
        },
        'Entity person - precision': {
            'count':
            ent_counter['n_gen_person_matches'],
            'total':
            ent_counter['n_gen_persons'],
            'percentage':
            ent_counter['n_gen_person_matches'] / ent_counter['n_gen_persons'],
        },
        'Entity GPE - recall': {
            'count':
            ent_counter['n_caption_gpes_matches'],
            'total':
            ent_counter['n_caption_gpes'],
            'percentage':
            ent_counter['n_caption_gpes_matches'] /
            ent_counter['n_caption_gpes'],
        },
        'Entity GPE - precision': {
            'count':
            ent_counter['n_gen_gpes_matches'],
            'total':
            ent_counter['n_gen_gpes'],
            'percentage':
            ent_counter['n_gen_gpes_matches'] / ent_counter['n_gen_gpes'],
        },
        'Entity ORG - recall': {
            'count':
            ent_counter['n_caption_orgs_matches'],
            'total':
            ent_counter['n_caption_orgs'],
            'percentage':
            ent_counter['n_caption_orgs_matches'] /
            ent_counter['n_caption_orgs'],
        },
        'Entity ORG - precision': {
            'count':
            ent_counter['n_gen_orgs_matches'],
            'total':
            ent_counter['n_gen_orgs'],
            'percentage':
            ent_counter['n_gen_orgs_matches'] / ent_counter['n_gen_orgs'],
        },
        'Entity DATE - recall': {
            'count':
            ent_counter['n_caption_date_matches'],
            'total':
            ent_counter['n_caption_date'],
            'percentage':
            ent_counter['n_caption_date_matches'] /
            ent_counter['n_caption_date'],
        },
        'Entity DATE - precision': {
            'count':
            ent_counter['n_gen_date_matches'],
            'total':
            ent_counter['n_gen_date'],
            'percentage':
            ent_counter['n_gen_date_matches'] / ent_counter['n_gen_date'],
        },
    }

    serialization_dir = os.path.dirname(args['file'])
    filename = os.path.basename(args['file']).split('.')[0]
    if args['use_processed']:
        filename += '_processed'

    output_file = os.path.join(serialization_dir,
                               f'{filename}_reported_metrics.json')
    with open(output_file, 'w') as file:
        json.dump(final_metrics, file, indent=4)

    for key, metric in final_metrics.items():
        print(f"{key}: {metric}")
Example No. 10
        Caps[k]['gt']['scores'], Caps[k]['gen']['scores'])
    print("Mass distribution:", "gt:", sum(np.exp(Caps[k]['gt']['scores'])),
          "gen:", sum(np.exp(Caps[k]['gen']['scores'])))
    # print(Caps[k])

print('Gen:', np.unique(np.array(gens)))
print('Gt:', np.unique(np.array(gts)))

keys = np.array(list(Caps))
batches = np.array_split(keys, 1000)
print("Processing in %d batches" % len(batches))
cnt = 0
for batch in batches:
    cnt += 1
    cider_scorer = CiderScorer(n=4, sigma=6)
    bleu4 = BleuScorer(n=4)
    infer = []
    print('batch indices:', batch)
    for k in batch:
        # print('all caps:', Caps[k])
        refs = Caps[k]['gt']['sents']
        print("Refs:", refs)
        for e, ref in enumerate(refs):
            _refs = refs.copy()
            _refs.pop(e)
            cider_scorer += (ref, _refs)
            bleu4 += (ref, _refs)
        for c in Caps[k]['gen']['sents']:
            cider_scorer += (c, refs)
            bleu4 += (c, refs)
        infer += infer_cosine_gp(Caps[k]['gen']['sents'], refs)
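A self-contained sketch of the leave-one-out pattern above (data is illustrative): each ground-truth sentence is scored against the remaining references, while each generated sentence is scored against the full reference set.

from pycocoevalcap.bleu.bleu_scorer import BleuScorer  # assumed layout

refs = ['a dog runs in the park', 'a dog is running outside', 'the dog runs']
gens = ['a dog runs through a park']

bleu4 = BleuScorer(n=4)
for e, ref in enumerate(refs):
    _refs = refs.copy()
    _refs.pop(e)           # ground truth vs. the other references
    bleu4 += (ref, _refs)
for c in gens:
    bleu4 += (c, refs)     # generation vs. all references
score, scores = bleu4.compute_score(option='closest', verbose=0)
print(score, scores[3])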
Example No. 11
def bleu():
    scorer = BleuScorer(n=4)
    scorer += (hypo[0], ref1)  # hypo[0] = 'word1 word2 word3 ...'
    # ref1 = ['word1 word2 word3 ...', 'word1 word2 word3 ...']
    score, _ = scorer.compute_score()
    print(score)
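A follow-up sketch of the option argument that appears throughout these examples: 'closest', 'shortest', and 'average' select which reference length the brevity penalty uses (behaviour as understood from the pycocoevalcap BleuScorer; import path and data assumed).

from pycocoevalcap.bleu.bleu_scorer import BleuScorer

hypo = ['a dog runs through a park']
ref1 = ['a dog runs in the park', 'a dog is running outside']

for option in ('closest', 'shortest', 'average'):
    scorer = BleuScorer(n=4)
    scorer += (hypo[0], ref1)
    score, _ = scorer.compute_score(option=option, verbose=0)
    print(option, score)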