Example #1
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    """Defining Scorers"""
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}

    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]
    """Generation Loop"""
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)

            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            sequences_ref[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in targets[0]
                    if j.item() not in bad_toks
                ])
            ]
            sequences_gen[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in predictions[0][1]
                    if j.item() not in bad_toks
                ])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]
    """Getting Scores"""
    bleu_score, bleu_scores = scorer_bleu.compute_score(
        sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(
        sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(
        sequences_ref, sequences_gen)
    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
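
A minimal, self-contained sketch of the input format the pycocoevalcap scorers above expect (the keys and sentences here are made up): both arguments map the same keys to lists of whitespace-tokenized strings, with exactly one hypothesis string per key.

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

refs = {0: ["a man is riding a horse", "someone rides a horse"]}
hyps = {0: ["a man rides a horse"]}  # exactly one hypothesis per key

bleu, _ = Bleu(4).compute_score(refs, hyps)    # list of BLEU-1..BLEU-4
rouge, _ = Rouge().compute_score(refs, hyps)   # ROUGE-L
cider, _ = Cider().compute_score(refs, hyps)   # CIDEr
print(bleu, rouge, cider)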
Example #2
def compute_batch_score(decode_res,
                        key2refs,
                        keys,
                        start_idx,
                        end_idx,
                        vocabulary,
                        scorer):
    """
    Args:
        decode_res: decoding results of model, [N, max_length]
        key2refs: references of all samples, dict(<key> -> [ref_1, ref_2, ..., ref_n])
        keys: keys of this batch, used to match decode results and refs
    Return:
        scores of this batch, [N,]
    """

    import numpy as np  # needed for the per-sample score array below

    if scorer is None:
        from pycocoevalcap.cider.cider import Cider
        scorer = Cider()

    hypothesis = {}
    references = {}

    for i in range(len(keys)):

        if keys[i] in hypothesis:
            continue

        # prepare candidate sentence
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            candidate.append(vocabulary.idx2word[w_t])

        hypothesis[keys[i]] = [" ".join(candidate), ]

        # prepare reference sentences
        references[keys[i]] = key2refs[keys[i]]

    score, scores = scorer.compute_score(references, hypothesis)
    key2score = {key: scores[i] for i, key in enumerate(references.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results 
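
A usage sketch for compute_batch_score, with a toy vocabulary object standing in for the real one; all names, keys and indices below are illustrative.

import numpy as np

class ToyVocab:
    idx2word = {0: "<start>", 1: "<end>", 2: "a", 3: "cat", 4: "sits"}

decode_res = np.array([[0, 2, 3, 4, 1],
                       [0, 2, 3, 1, 1]])
key2refs = {"clip1": ["a cat sits"], "clip2": ["a cat"]}
keys = ["clip1", "clip2"]

per_sample = compute_batch_score(decode_res, key2refs, keys,
                                 start_idx=0, end_idx=1,
                                 vocabulary=ToyVocab(), scorer=None)
print(per_sample)  # per-sample CIDEr scores, shape [2]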
def _define_metrics(gts, res):
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.meteor.meteor import Meteor  # requires a Java runtime

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
Example #4
def compute_cider_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios, dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    import numpy as np
    from pycocoevalcap.cider.cider import Cider
    scorer = Cider()

    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):

        if keys[i] in hypothesis:
            continue

        # prepare candidate
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate)]

        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)

    key2score = {key: scores[i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]

    return results
def eval(result_gts_path, result_res_path):
    # note: the name shadows the built-in eval(); imports are kept local
    # so the snippet stays self-contained
    import json
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider

    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)

    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
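
Calling the evaluator is then a one-liner; the paths below are illustrative and match the result_gts.json / result_res.json files written by the coco_caption_metrics example further down.

bleu, meteor, rouge, cider = eval('data/result/result_gts.json',
                                  'data/result/result_res.json')
print('BLEU-4: %.4f, METEOR: %.4f, ROUGE-L: %.4f, CIDEr: %.4f'
      % (bleu[3], meteor, rouge, cider))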
Example #6
def compute_batch_score(decode_res, refs, keys, start_idx, end_idx, vocabulary,
                        scorer):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        refs: references of all samples, dict(<key> -> [ref_1, ref_2, ..., ref_n])
        keys: keys of this batch, used to match decode results and refs
    Return:
        scores of this batch, [B,]
    """

    if scorer is None:
        from pycocoevalcap.cider.cider import Cider
        scorer = Cider()

    key2pred = {}
    key2refs = {}

    for i in range(len(keys)):

        # prepare candidate sentence
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            candidate.append(vocabulary.idx2word[w_t])

        key2pred[i] = [
            " ".join(candidate),
        ]

        # prepare reference sentences
        key2refs[i] = refs[keys[i]]

    score, scores = scorer.compute_score(key2refs, key2pred)
    return scores
Example #7
def cider():
    from pycocoevalcap.cider.cider import Cider

    # gts and res are assumed to be module-level dicts mapping the same
    # keys to lists of reference / hypothesis strings
    scorer = Cider()
    (score, scores) = scorer.compute_score(gts, res)
    print('cider = %s' % score)
Example #8
def cider(gts, res):
    from pycocoevalcap.cider.cider import Cider

    scorer = Cider()
    (score, scores) = scorer.compute_score(gts, res)
    # out_file is assumed to be an open file handle in the enclosing module
    out_file.write('CIDEr = %s' % score + '\n')
def coco_caption_metrics(predictions_list,
                         image_id_list,
                         vocabulary_path='data/vocabulary.json',
                         max_caption_length=25,
                         batch_size=32,
                         is_training=True):
    import json
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.meteor.meteor import Meteor

    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {w: i for i, w in enumerate(vocabulary_list)}
    id2word = {v: k for k, v in word2id.items()}

    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i in range(len(predictions_list)):
        for j in range(batch_size):
            sen_input = []
            for k in range(max_caption_length):
                id_input = int(predictions_list[i][k][j])
                sen_input.append(id2word[id_input])

            sen_pre = []
            for n in range(max_caption_length):
                word = sen_input[n]
                if word != '</S>':
                    sen_pre.append(word)
                else:
                    break

            str_input = ' '.join(sen_pre)
            image_id = image_id_list[i][j][0]

            # print(image_id)
            res[image_id] = [str_input]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
            
    print(' wmd_done\n time:  {}'.format(datetime.now()-time_now))
    print(' wmd_middle\n time:  {}'.format(time_now2-time_now1))
    #######################################################################################################
    print('--starting Cider--')
    df_mode = 'coco-val-df'
    time_now = datetime.now()
    CIDer = Cider(df=df_mode)  # use coco-val-df as the tf-idf corpus

    cider_all, cider_scores = CIDer.compute_score(ref, hypo, ImgId)

    print('--Cider done--\n time:  {}'.format(datetime.now()-time_now))
    
    #######################################################################################################
    # Spice scores
    spice_scores = []
    with open(spice_score_path) as file:
        sp_file = json.load(file)

    for ids in ImgId:
        spice_scores.append(sp_file[str(ids)])  # the scores JSON was saved with string keys
           
    print(' Spice_done\n time:  {}'.format(datetime.now()-time_now))
        
Example #11
    def end_epoch(self):
        path = Path(Options()["exp.dir"])

        dirname = path.joinpath("generated_sentences")
        # Create directory if it does not exist
        if not os.path.exists(dirname):
            try:
                os.makedirs(dirname)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Dump sentences to the directory
        for field in ["action", "justification"]:
            for key in ["ground_truth", "predicted"]:
                filepath = dirname.joinpath("%s_%s.txt" % (key, field))
                with open(filepath, "w") as f:
                    f.write("\n".join(self.sentences[key][field]))

        # Compute NLP quality scores (bleu, meteor, cider...)
        for field in ["action", "justification"]:
            cider = Cider()
            bleu = Bleu()
            meteor = Meteor()

            # Check if this is not empty
            if len(self.sentences["ground_truth"][field]) > 0:
                ground_truth = {
                    i: [sentence]
                    for i, sentence in enumerate(self.sentences["ground_truth"]
                                                 [field])
                }
                predicted = {
                    i: [sentence]
                    for i, sentence in enumerate(self.sentences["predicted"]
                                                 [field])
                }

                cider_score, _ = cider.compute_score(ground_truth, predicted)
                cider_score = cider_score * 100  # Convert to percentage

                bleus_score, _ = bleu.compute_score(ground_truth, predicted)
                bleu_score = bleus_score[
                    3] * 100  # Take bleu-4 and convert to percentage

                meteor_score, _ = meteor.compute_score(ground_truth, predicted)
                meteor_score = meteor_score * 100  # Convert to percentage
            else:
                # Otherwise all scores are 0
                cider_score, bleu_score, meteor_score = 0, 0, 0

            Logger().log_value('%s_epoch.cider_%s' % (self.mode, field),
                               cider_score,
                               should_print=True)
            Logger().log_value('%s_epoch.bleucoco_%s' % (self.mode, field),
                               bleu_score,
                               should_print=True)
            Logger().log_value('%s_epoch.meteorcoco_%s' % (self.mode, field),
                               meteor_score,
                               should_print=True)

        # Reset sentences
        self.sentences = {
            "ground_truth": {
                "action": [],
                "justification": []
            },
            "predicted": {
                "action": [],
                "justification": []
            }
        }
        return
Example #12
class SelfCriticalSequenceTrainingCriterion(FairseqCriterion):
    def __init__(self, args, task):
        super().__init__(args, task)
        self.task = task

        self.generator = SimpleSequenceGenerator(
            beam=args.scst_beam,
            penalty=args.scst_penalty,
            max_pos=args.max_target_positions,
            eos_index=task.target_dictionary.eos_index)

        # Needed for decoding model output to string
        self.conf_tokenizer = encoders.build_tokenizer(args)
        self.conf_decoder = encoders.build_bpe(args)
        self.captions_dict = task.target_dictionary

        # Tokenizer needed for computing CIDEr scores
        self.tokenizer = PTBTokenizer()
        self.scorer = Cider()

    @staticmethod
    def add_args(parser):
        parser.add_argument('--scst-beam',
                            type=int,
                            default=5,
                            help='beam size')
        parser.add_argument('--scst-penalty',
                            type=float,
                            default=1.0,
                            help='beam search length penalty')
        parser.add_argument('--scst-validation-set-size',
                            type=int,
                            default=0,
                            metavar='N',
                            help='limited size of validation set')

    @property
    def image_ids(self):
        return self.task.dataset('train').img_ds.image_ids

    def decode(self, x):
        """Decode model output.
        """
        x = self.captions_dict.string(x)
        x = self.conf_decoder.decode(x)
        return self.conf_tokenizer.decode(x)

    def generate(self, model, sample):
        """Generate captions using (simple) beam search.
        """
        tgt_captions = dict()
        gen_captions = dict()

        scores, _, tokens, _ = self.generator.generate(model, sample)

        counter = 0
        for i, tb in enumerate(tokens):
            image_id = self.image_ids[i]
            image_captions = sample['target'][i]

            for t in tb:
                counter += 1
                decoded = self.decode(t)
                tgt_captions[counter] = image_captions
                gen_captions[counter] = [{
                    'image_id': image_id,
                    'caption': decoded,
                    'id': 1
                }]

        gen_captions = self.tokenizer.tokenize(gen_captions)
        return tgt_captions, gen_captions, scores

    def forward(self, model, sample, reduce=True):
        sample_indices = sample['id']
        sample_device = sample_indices.device

        tgt_captions, gen_captions, scores = self.generate(model, sample)

        _, reward = self.scorer.compute_score(tgt_captions, gen_captions)
        reward = torch.from_numpy(reward).to(device=sample_device).view(
            scores.shape)

        # Mean of rewards is used as baseline rather than greedy
        # decoding (see also https://arxiv.org/abs/1912.08226).
        reward_baseline = torch.mean(reward, dim=1, keepdim=True)

        loss = -scores * (reward - reward_baseline)
        loss = loss.mean()

        sample_nsentences = sample['nsentences']
        sample_ntokens = sample['ntokens']

        logging_output = {
            'loss': loss.data,
            'ntokens': sample_ntokens,
            'nsentences': sample_nsentences,
            'sample_size': sample_nsentences,
        }
        return loss, sample_nsentences, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        return {
            'loss':
            sum(log.get('loss', 0) for log in logging_outputs),
            'ntokens':
            sum(log.get('ntokens', 0) for log in logging_outputs),
            'nsentences':
            sum(log.get('nsentences', 0) for log in logging_outputs),
            'sample_size':
            sum(log.get('sample_size', 0) for log in logging_outputs)
        }
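
The reward shaping in forward() above boils down to a few tensor operations. A standalone sketch of the self-critical loss with a mean-over-beam baseline (shapes and values are illustrative):

import torch

seq_logprobs = torch.randn(2, 5, requires_grad=True)  # per-sequence scores, [batch, beam]
reward = torch.rand(2, 5)                             # e.g. CIDEr per sampled sequence

baseline = reward.mean(dim=1, keepdim=True)  # mean beam reward as baseline
loss = (-seq_logprobs * (reward - baseline)).mean()
loss.backward()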
Example #13
    rouge, _ = rouge_obj.compute_score(wtd, wrd)

    rouges.append(rouge)

print(np.mean(rouges))

with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]

bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict,
                                                 word_response_dict)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(word_target_dict,
                                                       word_response_dict)
rouge_score, rouge_scores = rouge_obj.compute_score(word_target_dict,
                                                    word_response_dict)
cider_score, cider_scores = cider_obj.compute_score(word_target_dict,
                                                    word_response_dict)

print("ROUGE-L: ", rouge_score)
print("BLEU-1: ", bleu1_score)
print("BLEU-4: ", bleu4_score)
print("METEOR: ", meteor_score)
print("CiDER: ", cider_score)
Example #14
def coco_caption_metrics_hier(predicts_list,
                              sentences_list,
                              image_id_list,
                              config,
                              batch_size=26,
                              is_training=True):
    with open(config.vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {w: i for i, w in enumerate(vocabulary_list)}
    id2word = {v: k for k, v in word2id.items()}

    gts = {}
    res = {}
    for i in range(len(predicts_list)):
        for j in range(batch_size):
            sent_pre, sent_gt = [], []
            for k in range(config.max_sentence_num *
                           config.max_sentence_length):
                id_input = int(predicts_list[i][k][j])
                sent_pre.append(id2word[id_input])

                id_gt = sentences_list[i][j][k]
                if id2word[id_gt] not in ('</S>', '<EOS>'):
                    sent_gt.append(id2word[id_gt])

            # sent_pre2 = sent_pre
            sent_pre2 = []
            for n in range(config.max_sentence_num):
                for m in range(config.max_sentence_length):
                    word = sent_pre[n * config.max_sentence_length + m]
                    if word != '</S>':
                        sent_pre2.append(word)
                    else:
                        break

            str_pre, str_gt = ' '.join(sent_pre2), ' '.join(sent_gt)
            image_id = image_id_list[i][j][0]
            gts[str(image_id)] = [str_gt]
            res[str(image_id)] = [str_pre]

    if not is_training:
        with open(config.result_gts_path, 'w') as file:
            json.dump(gts, file)
        with open(config.result_res_path, 'w') as file:
            json.dump(res, file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)
    # #
    # meteor_scorer = Meteor()
    # meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    # return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
    return bleu, round(rouge, 4), round(cider, 4)
Example #15
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TEST',
        transform=transforms.Compose([normalize])),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=0,
                                         pin_memory=False)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)

        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs,
                                                   encoder_out,
                                                   zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))

            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe

            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1),
                                          (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                # (s) the k largest scores over all beam/word pairs
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)

            # Convert unrolled indices to actual indices of scores:
            # the view above flattened (beam, word) pairs, so integer
            # division recovers the beam and modulo recovers the word
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]

            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # strip <start>, <end> and <pad>
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU-1~BLEU4 scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
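
The flattened top-k step in the beam search above is worth isolating: after scores.view(-1), integer division recovers which beam each candidate extends and modulo recovers which word it appends. A small self-contained check:

import torch

vocab_size = 7
scores = torch.randn(3, vocab_size)            # (beams, vocab)
top_vals, flat_idx = scores.view(-1).topk(3)   # top-k over all beam/word pairs
prev_beam = flat_idx // vocab_size             # beam each candidate came from
next_word = flat_idx % vocab_size              # word that extends it
assert torch.equal(scores[prev_beam, next_word], top_vals)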
Example #16
def eval_cv(engine, key2pred, key2refs):
    # zh is assumed to be defined in the enclosing scope
    # (a flag for Chinese tokenization in this Cider variant)
    scorer = Cider(zh=zh)
    score, scores = scorer.compute_score(key2refs, key2pred)
    engine.state.metrics["score"] = score
    key2pred.clear()
def run_load_gap_filler(pretrained_filename,
                        do_bleu=False,
                        must_have_anp=False,
                        copy_if_no_anp=False,
                        replace_adj=False,
                        get_human=False,
                        semi_human=False):
    rnn = RNNModel()
    rnn.load_model(pretrained_filename)
    rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST

    if get_human:
        id_to_caps = pickle.load(open("coco_mturk/id_to_caps.pik", "rb"))

    rnn.build_model_core()
    rnn.load_val_dataset()

    rnn.build_sentence_generator()

    rnn.build_perplexity_calculator()

    pos_sentence_res = []
    pos_att_res = []

    des_sentence_res = []
    des_att_res = []

    img_files = []
    img_ids = []

    id_to_sentences = {}

    seen_ids = set()
    if 'added_words' in rnn.conf:
        new_words = set([w[0] for w in rnn.conf['added_words']])
    else:
        new_words = set()
    num_ignore = 0
    num_not_ignore = 0
    for idx in range(rnn.V_valid.shape[0]):
        img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]]
        img_id = rnn.Id_valid[idx]
        if img_id not in id_to_sentences: id_to_sentences[img_id] = []
        if replace_adj:
            id_to_sentences[img_id] = [
                ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        elif get_human:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
            np.random.shuffle(id_to_sentences[img_id])
            print(len(id_to_sentences[img_id]))
            human_sen_pos = id_to_sentences[img_id].pop()
            print(len(id_to_sentences[img_id]))
            if not id_to_sentences[img_id]: continue
        else:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        if img_id in seen_ids: continue
        seen_ids.add(img_id)
        if get_human and not semi_human:
            pos_sen = human_sen_pos.split()[::-1]
            np.random.shuffle(id_to_caps[img_id])
            des_sen = id_to_caps[img_id][0][::-1]
        else:
            lp, pos_sen, pos_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=1.0, beam_size=5)
            lp, des_sen, des_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5)
            pos_sen = pos_sen[:-1]
            des_sen = des_sen[:-1]
            pos_att = pos_att[:-1]
        pos_att = np.array(pos_att)
        pos_att = pos_att.flatten()
        des_att = np.zeros((len(des_sen), ))
        if must_have_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                num_ignore += 1
                continue
            num_not_ignore += 1
        if copy_if_no_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                pos_sen = des_sen
        if replace_adj:
            pos_sen = do_replace_adj(pos_sen[::-1])[::-1]
            des_sen = do_replace_adj(des_sen[::-1])[::-1]

        new_pos_sen = []
        for vv, a in zip(pos_sen, pos_att):
            out = vv
            col = ""
            if a > 0.75:
                col = "#FF3300"
            elif a > 0.5:
                col = "#FF5C33"
            elif a > 0.25:
                col = "#FF8566"
            if col:
                out = "<font style='background-color: %s'>%s</font>" % (col,
                                                                        vv)
            new_pos_sen.append(out)
        pos_sen = new_pos_sen
        print(pos_sen)
        print(pos_att)
        print(des_sen)
        print_it = False
        for v in pos_sen:
            if v in new_words:
                print_it = True
        if print_it:
            for x in list(zip(pos_sen, pos_att))[::-1]:
                print(x[0], end=' ')
            print("")
        pos_att = pos_att[:len(pos_sen)]
        des_att = des_att[:len(des_sen)]
        pos_sentence_res.append(pos_sen[::-1])
        pos_att_res.append(np.exp(pos_att[::-1]))
        des_sentence_res.append(des_sen[::-1])
        des_att_res.append(np.exp(des_att[::-1]))
        img_files.append(img_file)
        img_ids.append(img_id)

    output = {
        'pos_sen': pos_sentence_res,
        'pos_att': pos_att_res,
        'des_sen': des_sentence_res,
        'des_att': des_att_res,
        'img_files': img_files,
        'img_ids': img_ids
    }
    pickle.dump(output,
                open("output_data/sen_att_pos_01.pik", "wb"),
                protocol=2)

    if must_have_anp:
        print("Must have ANP % removed:",
              num_ignore / float(num_not_ignore) * 100.0)

    print("getting Positive perplexity")
    print(rnn.get_val_perplexity())
    print("got perplexity")

    print("getting Descriptive perplexity")
    print(rnn.get_val_perplexity(base=True))
    print("got perplexity")

    gts = {}
    res = {}
    fout = open("eval/output_pos", "w")
    for line, iid in zip(pos_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res: res[iid] = []
        res[iid].append(' '.join(line))
    fout.close()

    res_des = {}
    fout = open("eval/output_des", "w")
    for line, iid in zip(des_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res_des: res_des[iid] = []
        res_des[iid].append(' '.join(line))
    fout.close()

    for i in range(3):
        fout = open("eval/reference%d" % i, "w")
        for cid in img_ids:
            if cid not in gts: gts[cid] = []
            if len(id_to_sentences[cid]) > i:
                gts[cid].append(id_to_sentences[cid][i])
                fout.write(id_to_sentences[cid][i] + "\n")
            else:
                fout.write("\n")
        fout.close()

    bleu = Bleu()
    total_ref_sentences = 0
    for i in list(gts.keys()):
        total_ref_sentences += len(gts[i])
    print("Total ref sentences:", total_ref_sentences)
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    print("Descriptive:", bleu.compute_score(gts, res_des)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    print("Descriptive:", rouge.compute_score(gts, res_des)[0])
    cider = Cider()
    print("Cider:")
    print("Positive:", cider.compute_score(gts, res)[0])
    print("Descriptive:", cider.compute_score(gts, res_des)[0])
    meteor = Meteor()
    print("Meteor:")
    print("Positive:", meteor.compute_score(gts, res)[0])
    print("Descriptive:", meteor.compute_score(gts, res_des)[0])