Example #1
def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ', mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()
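
For reference, a minimal sketch of the input format the pycocoevalcap scorers above expect, assuming the package is installed (the captions are toy data): both arguments to compute_score are dicts mapping a key to a list of strings, where a reference entry may hold several strings but each hypothesis entry holds exactly one.

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge

ref = {0: ["a man rides a horse", "a person riding a horse"]}
hyp = {0: ["a man is riding a horse"]}

bleu, _ = Bleu(4).compute_score(ref, hyp)   # list: [BLEU-1, BLEU-2, BLEU-3, BLEU-4]
rouge, _ = Rouge().compute_score(ref, hyp)  # single ROUGE-L float
print(bleu, rouge)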
Example #2
def language_eval_excoco(predictions, predictions_bleu, sents_label_eval,
                         loader):

    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval,
                                             predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))

    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score

    return lang_stat
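
Note the two candidate structures: CiderD expects its candidates in its own format (in common implementations, a list of {'image_id': ..., 'caption': [...]} records), while Bleu, Meteor, and Rouge all take plain id-to-list-of-strings dicts, which is presumably why predictions and predictions_bleu are passed separately here.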
Example #3
def rouge_scorer(reference, hypothesis):
    # =================================================
    # Compute scores
    # =================================================
    scorer = Rouge()

    average_score, score = scorer.compute_score(reference, hypothesis)

    return average_score, score
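
A toy call, with Rouge imported from pycocoevalcap as in the surrounding examples:

avg, per_sentence = rouge_scorer({0: ["the cat sat on the mat"]},
                                 {0: ["a cat sat on a mat"]})
print(avg)  # corpus-level ROUGE-L (the mean of per_sentence)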
Example #4
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    """Defining Scorers"""
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}

    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]
    """Generation Loop"""
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)

            predictions = model.sample_v2(images, topics, beam_size=beam_size)
            sequences_ref[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in targets[0]
                    if j.item() not in bad_toks
                ])
            ]
            sequences_gen[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in predictions[0][1]
                    if j.item() not in bad_toks
                ])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]
    """Getting Scores"""
    bleu_score, bleu_scores = scorer_bleu.compute_score(
        sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(
        sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(
        sequences_ref, sequences_gen)
    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
Example #5
def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
Example #6
def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)

    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
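
Both JSON files are assumed to hold the usual id-to-list-of-captions mapping; a hypothetical pair could be produced like this (note that eval here shadows the Python builtin, and the METEOR scorer launches a bundled Java jar, so a JRE must be available):

import json

with open('result_gts.json', 'w') as f:
    json.dump({'42': ['a cat sits on a mat', 'a cat on a mat']}, f)
with open('result_res.json', 'w') as f:
    json.dump({'42': ['a cat is sitting on a mat']}, f)

bleu, meteor, rouge, cider = eval('result_gts.json', 'result_res.json')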
Example #7
class RougeBleuScore(Metric):

    def __init__(self, coco, vocab, n = 4):
        self.coco = coco
        self.vocab = vocab
        self.bleu = Bleu(n)
        self.n = n
        self.rouge = Rouge()

    def evaluate(self, y_pred, y, image_ids):
        if isinstance(y_pred, list):
            caption_pred_list = caption_list_to_words(y_pred, self.vocab)
        else:
            caption_pred_list = tensor_to_words(y_pred, y, self.vocab)
        captions_pred, captions_gt = extract_captions(image_ids, caption_pred_list, self.coco)
        blockPrint()
        scores = self.bleu.compute_score(captions_gt, captions_pred)[0]
        enablePrint()
        scores.append(self.rouge.compute_score(captions_gt, captions_pred)[0])
        return scores
Example #8
def calculate_metric(rnn, meteor=None):
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts: gts[iid] = []
        #gts[iid].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        gts[iid] = [
            ' '.join(rnn.dp.tokens[i][::-1])
            for i in rnn.dp.img_id_to_tokens[iid]
        ]
        if iid in res: continue
        res[iid] = []
        #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        (lp, pos_sen) = decoder_beamsearch(rnn,
                                           rnn.V_valid[idx],
                                           senti=1.0,
                                           beam_size=1)
        pos_sen = pos_sen[:-1]
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    lp_avg /= float(lp_c)
    print('Average sentence probability:', lp_avg)

    bleu = Bleu()
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    if meteor is None:
        meteor = Meteor()
    print("Meteor:")
    mscore = meteor.compute_score(gts, res)[0]
    print("Positive:", mscore)
    return mscore
Example #9
def rouge():
    # assumes module-level ``gts`` and ``res`` dicts are already defined
    scorer = Rouge()
    score, scores = scorer.compute_score(gts, res)
    print('rouge = %s' % score)
Example #10
def rouge(gts, res):
    # assumes a module-level ``out_file`` handle is already open for writing
    scorer = Rouge()
    score, scores = scorer.compute_score(gts, res)
    out_file.write('ROUGE = %s' % score + '\n')
Example #11
def coco_caption_metrics(predictions_list,
                         image_id_list,
                         vocabulary_path='data/vocabulary.json',
                         max_caption_length=25,
                         batch_size=32,
                         is_training=True):
    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i in range(len(predictions_list)):
        for j in range(batch_size):
            sen_input, sen_ground_truth = [], []
            for k in range(max_caption_length):
                id_input = int(predictions_list[i][k][j])
                sen_input.append(id2word[id_input])

            sen_pre = []
            for n in range(max_caption_length):
                word = sen_input[n]
                if word != '</S>':
                    sen_pre.append(word)
                else:
                    break

            str_input = ' '.join(sen_pre)
            image_id = image_id_list[i][j][0]

            # print(image_id)
            res[image_id] = [str_input]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        # for key in gts.keys():
        #     str_input = res[key]
        #     str_grundtruth = gts[key]
        #     print(key)
        #     print(str_input)
        #     print(str_grundtruth)
        #     print('*' * 100)

        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)
        # print('result.json get success')

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
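
Note that the result_gts.json/result_res.json files written in the non-training branch appear to match the id-to-caption-list format consumed by the standalone eval function of Example #6, so the two snippets plausibly form a save-then-score pipeline.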
Example #12
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TEST',
        transform=transforms.Compose([normalize])),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=0,
                                         pin_memory=False)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Dicts to store references (true captions) and hypotheses (predictions) for each
    # image, keyed by image index, in the format the scorers expect:
    # references = {'0': [ref1a, ref1b, ref1c], '1': [ref2a, ref2b], ...},
    # hypotheses = {'0': [hyp1], '1': [hyp2], ...}
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)

        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs,
                                                   encoder_out,
                                                   zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))
        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))

            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe

            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1),
                                          (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                # (s) the k largest scores across all beams
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)

            # Convert unrolled indices to actual indices of scores
            # (scores were flattened above, so integer-divide to recover which
            # beam each top word came from)
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]

            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        # Fall back to the best live beam if no sequence reached <end> within
        # the step limit
        if len(complete_seqs_scores) == 0:
            complete_seqs.extend(seqs.tolist())
            complete_seqs_scores.extend(top_k_scores.squeeze(1).tolist())
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = ([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU-1~BLEU4 scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
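
Of these five scorers, only Bleu returns a list as its corpus score (BLEU-1 through BLEU-4); the others return a single float. METEOR and SPICE shell out to bundled Java jars, so they are markedly slower than the pure-Python scorers and need a JRE on the path.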
Example #13
import numpy as np
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge

# ``system`` is the path of the system-output file; ``ref1_strs`` and
# ``ref2_strs`` are assumed to have been filled from two reference files in
# the same way ``sys_strs`` is filled below.
bleu_obj = Bleu(4)
rouge_obj = Rouge()

sys_strs = []
with open(system, 'r') as f:
    for line in f:
        sys_strs.append(line.strip())

assert len(ref1_strs) == len(ref2_strs)
assert len(ref2_strs) == len(sys_strs)

word_target_dict = {}
word_response_dict = {}

rouges = []
for i in range(len(ref1_strs)):
    wtd = {i: [ref1_strs[i], ref2_strs[i]]}
    wrd = {i: [sys_strs[i]]}
    rouge, _ = rouge_obj.compute_score(wtd, wrd)

    rouges.append(rouge)

print(np.mean(rouges))

with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]

bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict,
                                                 word_response_dict)
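
Two scoring strategies are mixed here: ROUGE-L is computed pair by pair and then averaged, which for this particular scorer matches the corpus-level value (pycocoevalcap's Rouge.compute_score is itself a mean over instances), while BLEU is computed once over the full dicts, since corpus BLEU aggregates n-gram statistics and is not an average of per-sentence scores.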
Example #14
def coco_caption_metrics_hier(predicts_list,
                              sentences_list,
                              image_id_list,
                              config,
                              batch_size=26,
                              is_training=True):
    with open(config.vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    gts = {}
    res = {}
    for i in range(len(predicts_list)):
        for j in range(batch_size):
            sent_pre, sent_gt = [], []
            for k in range(config.max_sentence_num *
                           config.max_sentence_length):
                id_input = int(predicts_list[i][k][j])
                sent_pre.append(id2word[id_input])

                id_gt = sentences_list[i][j][k]
                if id2word[id_gt] not in ('</S>', '<EOS>'):
                    sent_gt.append(id2word[id_gt])

            # sent_pre2 = sent_pre
            sent_pre2 = []
            for n in range(config.max_sentence_num):
                for m in range(config.max_sentence_length):
                    word = sent_pre[n * config.max_sentence_length + m]
                    if word != '</S>':
                        sent_pre2.append(word)
                    else:
                        break

            str_pre, str_gt = ' '.join(sent_pre2), ' '.join(sent_gt)
            image_id = image_id_list[i][j][0]
            gts[str(image_id)] = [str_gt]
            res[str(image_id)] = [str_pre]

    if not is_training:
        with open(config.result_gts_path, 'w') as file:
            json.dump(gts, file)
        with open(config.result_res_path, 'w') as file:
            json.dump(res, file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)
    # #
    # meteor_scorer = Meteor()
    # meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    # return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
    return bleu, round(rouge, 4), round(cider, 4)
Example #15
def run_load_gap_filler(pretrained_filename,
                        do_bleu=False,
                        must_have_anp=False,
                        copy_if_no_anp=False,
                        replace_adj=False,
                        get_human=False,
                        semi_human=False):
    rnn = RNNModel()
    rnn.load_model(pretrained_filename)
    rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST

    if get_human:
        id_to_caps = pickle.load(open("coco_mturk/id_to_caps.pik", "rb"))

    rnn.build_model_core()
    rnn.load_val_dataset()

    rnn.build_sentence_generator()

    rnn.build_perplexity_calculator()
    #print rnn.sample_sentence(rnn.V_valid[0])
    #print decoder_beamsearch2(rnn, rnn.V_valid[0])
    #print decoder_beamsearch(rnn, rnn.V_valid[0])

    #calculate_metric(rnn)
    #sys.exit(0)

    pos_sentence_res = []
    pos_att_res = []

    des_sentence_res = []
    des_att_res = []

    img_files = []
    img_ids = []

    id_to_sentences = {}

    seen_ids = set()
    if 'added_words' in rnn.conf:
        new_words = set([w[0] for w in rnn.conf['added_words']])
    else:
        new_words = set()
    num_ignore = 0
    num_not_ignore = 0
    for idx in range(rnn.V_valid.shape[0]):
        img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]]
        img_id = rnn.Id_valid[idx]
        if img_id not in id_to_sentences: id_to_sentences[img_id] = []
        #id_to_sentences[img_id].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        if replace_adj:
            id_to_sentences[img_id] = [
                ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        elif get_human:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
            np.random.shuffle(id_to_sentences[img_id])
            print(len(id_to_sentences[img_id]))
            human_sen_pos = id_to_sentences[img_id].pop()
            print(len(id_to_sentences[img_id]))
            if not id_to_sentences[img_id]: continue
        else:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        #print id_to_sentences[img_id]
        if img_id in seen_ids: continue
        seen_ids.add(img_id)
        if get_human and not semi_human:
            pos_sen = human_sen_pos.split()[::-1]
            np.random.shuffle(id_to_caps[img_id])
            des_sen = id_to_caps[img_id][0][::-1]
        else:
            lp, pos_sen, pos_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=1.0, beam_size=5)
            lp, des_sen, des_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5)
            pos_sen = pos_sen[:-1]
            des_sen = des_sen[:-1]
            #des_att = des_att[:-1]
            pos_att = pos_att[:-1]
        #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        pos_att = np.array(pos_att)
        pos_att = pos_att.flatten()
        #des_att = np.array(des_att)
        #des_att = des_att.flatten()
        des_att = np.zeros((len(des_sen), ))
        #pos_att = np.zeros((len(pos_sen),))
        if must_have_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                num_ignore += 1
                continue
            num_not_ignore += 1
        if copy_if_no_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                pos_sen = des_sen
        if replace_adj:
            pos_sen = do_replace_adj(pos_sen[::-1])[::-1]
            des_sen = do_replace_adj(des_sen[::-1])[::-1]

        #des_sen, des_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([-1.0], dtype=theano.config.floatX))
        new_pos_sen = []
        for vv, a in zip(pos_sen, pos_att):
            out = vv
            col = ""
            if a > 0.75:
                col = "#FF3300"
            elif a > 0.5:
                col = "#FF5C33"
            elif a > 0.25:
                col = "#FF8566"
            #if a > 0.75:
            #    col = "#33CC33"# "#3366FF"
            #elif a > 0.5:
            #    col = "#70DB70" #"#5C85FF"
            #elif a > 0.25:
            #    col = "#ADEBAD" #"#85A3FF"
            if col:
                out = "<font style='background-color: %s'>%s</font>" % (col,
                                                                        vv)
            new_pos_sen.append(out)
        pos_sen = new_pos_sen
        print(pos_sen)
        print(pos_att)
        print(des_sen)
        print_it = False
        for v in pos_sen:
            if v in new_words:
                print_it = True
        if print_it:
            for x in list(zip(pos_sen, pos_att))[::-1]:
                print(x[0], end=' ')
            print("")
        #for x in zip(pos_sen, pos_att)[::-1]:
        #    print x[0],
        #print ""
        #for x in zip(des_sen, des_att)[::-1]:
        #    print x[0],
        #print "\n"
        pos_att = pos_att[:len(pos_sen)]
        des_att = des_att[:len(des_sen)]
        pos_sentence_res.append(pos_sen[::-1])
        pos_att_res.append(np.exp(pos_att[::-1]))
        des_sentence_res.append(des_sen[::-1])
        des_att_res.append(np.exp(des_att[::-1]))
        img_files.append(img_file)
        img_ids.append(img_id)

    output = {
        'pos_sen': pos_sentence_res,
        'pos_att': pos_att_res,
        'des_sen': des_sentence_res,
        'des_att': des_att_res,
        'img_files': img_files,
        'img_ids': img_ids
    }
    pickle.dump(output,
                open("output_data/sen_att_pos_01.pik", "wb"),
                protocol=2)

    if must_have_anp:
        print("Must have ANP % removed:",
              num_ignore / float(num_not_ignore) * 100.0)

    print("getting Positive perplexity")
    print(rnn.get_val_perplexity())
    print("got perplexity")

    print("getting Descriptive perplexity")
    print(rnn.get_val_perplexity(base=True))
    print("got perplexity")

    gts = {}
    res = {}
    fout = open("eval/output_pos", "w")
    for line, iid in zip(pos_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res: res[iid] = []
        res[iid].append(' '.join(line))
    fout.close()

    res_des = {}
    fout = open("eval/output_des", "w")
    for line, iid in zip(des_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res_des: res_des[iid] = []
        res_des[iid].append(' '.join(line))
    fout.close()

    for i in range(3):
        fout = open("eval/reference%d" % i, "w")
        for cid in img_ids:
            if cid not in gts: gts[cid] = []
            if len(id_to_sentences[cid]) > i:
                gts[cid].append(id_to_sentences[cid][i])
                fout.write(id_to_sentences[cid][i] + "\n")
            else:
                fout.write("\n")
        fout.close()

    bleu = Bleu()
    #for i in gts.keys()[:10]:
    #    print gts[i]
    #    print res_des[i]
    #    print res[i]
    #    print ""
    total_ref_sentences = 0
    for i in list(gts.keys()):
        total_ref_sentences += len(gts[i])
    print("Total ref sentences:", total_ref_sentences)
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    print("Descriptive:", bleu.compute_score(gts, res_des)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    print("Descriptive:", rouge.compute_score(gts, res_des)[0])
    cider = Cider()
    print("Cider:")
    print("Positive:", cider.compute_score(gts, res)[0])
    print("Descriptive:", cider.compute_score(gts, res_des)[0])
    meteor = Meteor()
    print("Meteor:")
    print("Positive:", meteor.compute_score(gts, res)[0])
    print("Descriptive:", meteor.compute_score(gts, res_des)[0])
Example #16
class SentenceEvaluator(object):
    def __init__(self):
        self.gt = {}
        self.gen = {}
        self.count = 0
        self.bleu = Bleu()
        self.rouge = Rouge()
        self.rb = pyrb.Readability(syllable_counter=pyrb.CMUDictCounter())
        #self.meteor = Meteor()
        #self.cider = Cider()

    def add_sentence_pair(self, generated, ground_truth):
        if not isinstance(generated, str):
            print("ERROR:", generated)
            print(type(generated))
        assert isinstance(generated, str)
        assert isinstance(ground_truth, str)

        self.gt[self.count] = [ground_truth]
        self.gen[self.count] = [generated]
        self.count += 1

    def add_pairs(self, generated, ground_truth):
        assert len(generated) == len(ground_truth)

        for gen, gt in zip(generated, ground_truth):
            self.add_sentence_pair(gen, gt)

    def clear(self):
        self.gt = {}
        self.gen = {}
        self.count = 0

    def edit_distance(self):
        ed = EditDistance()

        total_dist = 0
        total_norm_dist = 0
        op_count = {'m': 0, 'i': 0, 'd': 0, 'r': 0}
        op_count_norm = {'m': 0, 'i': 0, 'd': 0, 'r': 0}
        num_examples = len(self.gt)
        num_examples = max(num_examples, 1)
        for i in self.gt.keys():
            gt = self.gt[i][0].split()
            gen = self.gen[i][0].split()

            max_len = float(max(len(gt), len(gen)))
            max_len = max(max_len, 1.0)
            dist = ed.compute(gt, gen)
            total_dist += dist
            total_norm_dist += dist / max_len

            ops = ed.operations()
            for op in ops:
                op_count[op] += 1
                op_count_norm[op] += 1.0 / max_len

        mean_dist = total_dist / float(num_examples)
        mean_norm_dist = total_norm_dist / float(num_examples)

        for op in op_count:
            op_count[op] /= float(num_examples)
            op_count_norm[op] /= float(num_examples)

        return mean_dist, mean_norm_dist, op_count, op_count_norm

    def bleu_score(self):
        score, scores = self.bleu.compute_score(self.gt, self.gen)
        return score

    def bleu_scores(self):
        score, scores = self.bleu.compute_score(self.gt, self.gen)
        return np.array(scores).T

    def rouge_score(self):
        return self.rouge.compute_score(self.gt, self.gen)[0]

    def meteor_score(self):
        return self.meteor.compute_score(self.gt, self.gen)[0]

    def cider_score(self):
        return self.cider.compute_score(self.gt, self.gen)[0]

    def _get_words_per_sequence(self, lst):
        lens = [len(a[0].split()) for a in lst]
        return np.array(lens, dtype=np.int32)

    def _get_words_per_sentence(self, lst):
        lens = []
        for a in lst:
            for s in nltk.sent_tokenize(a[0]):
                lens.append(len(s.split()))
        return np.array(lens, dtype=np.int32)

    def mean_words_per_sentence_gt(self):
        return np.mean(self._get_words_per_sentence(self.gt.values()))

    def mean_words_per_sentence_gen(self):
        return np.mean(self._get_words_per_sentence(self.gen.values()))

    def mean_words_per_sentence_diff(self):
        gt_wps = self._get_words_per_sequence(self.gt.values())
        gen_wps = self._get_words_per_sequence(self.gen.values())
        return np.mean(gt_wps - gen_wps)

    def _get_sentence_list(self, sent_map):
        text = []
        for sent in sent_map.values():
            text.append(sent[0])
        return text

    def _get_sentence_list_gt(self):
        return self._get_sentence_list(self.gt)

    def _get_sentence_list_gen(self):
        return self._get_sentence_list(self.gen)

    def _text_stats_str(self, sentences):
        text = []
        for sent in sentences:
            sent_strip = sent.strip()
            if len(sent_strip) == 0 or sent_strip[-1] != '.':
                text.append(sent_strip + '.')
            else:
                text.append(sent_strip)
        text = " ".join(text)
        stat_str = ""
        try:
            fre = self.rb.flesch_kincaid_reading_ease(text)
            stat_str += "Flesch reading ease: %s\n" % str(fre)
            #si = textstat.smog_index(text)
            #stat_str += "Smog index: %s\n" % str(si)
            fkg = self.rb.flesch_kincaid_grade_level(text)
            stat_str += "Flesch-Kincaid grade: %s\n" % str(fkg)
            cli = self.rb.coleman_liau_index(text)
            stat_str += "Coleman-Liau index: %s\n" % str(cli)
            ari = self.rb.automated_readability_index(text)
            stat_str += "Automated readability index: %s\n" % str(ari)
            dcrs = self.rb.dale_chall_readability(text)
            stat_str += "Dale-Chall readability score: %s\n" % str(dcrs)
            #lwf = textstat.linsear_write_formula(text)
            #stat_str += "Linsear write formula: %s\n" % str(lwf)
            #gf = textstat.gunning_fog(text)
            #stat_str += "Gunning fog: %s\n" % str(gf)
        except Exception as e:
            stat_str += "Text quality is poor: caused an exception during evaluation."
            print(e)

        return stat_str

    def __repr__(self):
        #for i in self.gt:
        #    print self.gt[i]
        #    print self.gen[i]
        #    print ""
        bleu = self.bleu_score()
        rouge = self.rouge_score()
        #meteor = self.meteor_score()
        #cider = self.cider_score()

        rep = "Evaluation Results (%d pairs):\n" % len(self.gt)
        rep += "Bleu: %s\n" % str(bleu)
        rep += "Rouge: %s\n" % str(rouge)
        #rep += "Meteor: %s\n" % str(meteor)
        #rep += "Cider: %s\n" % str(cider)

        words_per_sentence_gt = self.mean_words_per_sentence_gt()
        rep += "Mean words per sentence ground-truth: %f\n" % words_per_sentence_gt
        words_per_sentence_gen = self.mean_words_per_sentence_gen()
        rep += "Mean words per sentence generated: %f\n" % words_per_sentence_gen
        words_per_sentence_diff = self.mean_words_per_sentence_diff()
        rep += "Mean words per sentence diff 'mean(|gt| - |gen|)': %f\n" % words_per_sentence_diff

        rep += "--------Generated Readability Stats:--------\n"
        rep += self._text_stats_str(self._get_sentence_list_gen())
        rep += "--------Ground Truth Readability Stats:--------\n"
        rep += self._text_stats_str(self._get_sentence_list_gt())
        return rep

    def print_edit_distance(self):
        mean_dist, mean_norm_dist, op_average, op_average_norm = self.edit_distance()
        print("EditDistance Stats:")
        print("mean_dist:", mean_dist)
        print("mean_norm_dist:", mean_norm_dist)
        print("op_average:", op_average)
        print("op_average_norm:", op_average_norm)