def eval_meteor(references, preds, best_match=False):
    if best_match:
        meteor_scores = []
        for refs, pred in zip(references, preds):
            instance_scores = [meteor_score([ref], pred) for ref in refs]
            meteor_scores.append(max(instance_scores))
    else:
        meteor_scores = [meteor_score(refs, pred) for refs, pred in zip(references, preds)]

    return round(sum(meteor_scores) / len(meteor_scores), 3), meteor_scores
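All of the snippets on this page call NLTK's METEOR implementation. A minimal, self-contained sketch of the assumed import and calling convention follows; note that the references always come first and the hypothesis second, and that recent NLTK releases (roughly 3.6.6 and later) expect pre-tokenized token lists while older releases accepted raw strings, which explains the mix of styles in the examples.

# Minimal sketch of the import and call style assumed throughout this page.
# Requires the WordNet data: nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score, single_meteor_score

references = [["the", "cat", "sat", "on", "the", "mat"]]   # list of tokenized references
hypothesis = ["a", "cat", "sat", "on", "the", "mat"]        # tokenized hypothesis

print(meteor_score(references, hypothesis))            # best score over all references
print(single_meteor_score(references[0], hypothesis))  # one reference vs. one hypothesis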
Example #2
def calculate_metric(hyp, ref, context, effective_length=1024):
    # ===== Calculate rouge ========
    with open('../result/rouge.txt', 'a') as f_result:
        rouge = Rouge()
        print(len(hyp))
        print(len(ref))
        hyp, ref = zip(*[(x, y) for x, y in zip(hyp, ref)
                         if len(x) > 3 and len(y) > 3])
        print(len(hyp))
        hyp = [x[:effective_length] for x in hyp]
        ref = [x[:effective_length] for x in ref]
        scores = rouge.get_scores(hyp, ref, avg=True)
        print("ROUGE", scores)
        import time
        f_result.write(time.asctime() + '\n')
        f_result.write(args.model_dir + '\t' + str(effective_length) + '\n')
        f_result.write(str(scores))
        f_result.write('\n')
    # ====== Calculate Meteor =========
    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    meteor_sum = 0
    for i in range(min(len(ref), len(hyp))):
        meteor_sum += meteor_score([ref[i]], hyp[i])

    meteor_sum /= min(len(ref), len(hyp))
    print(meteor_sum)
Example #3
def print_out_bleu_and_meteor_score(predicted_path, expected_path):

    scores = [('BLEU SCORE-1: ', []), ('BLEU SCORE-2: ', []),
              ('BLEU SCORE-3: ', []), ('BLEU SCORE-4: ', []),
              ('METEOR SCORE: ', [])]

    with open(predicted_path, 'r') as fp_pred, open(expected_path,
                                                    'r') as fp_exp:
        for prediction, expected in tzip(fp_pred, fp_exp):
            prediction = prediction.split()
            expected_list = expected.split()

            # sentence_bleu expects the reference list first, then the hypothesis
            scores[0][1].append(
                sentence_bleu([expected_list], prediction, weights=(1, 0, 0, 0)))
            scores[1][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 1, 0, 0)))
            scores[2][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 0, 1, 0)))
            scores[3][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 0, 0, 1)))
            # meteor_score likewise expects the reference(s) first, then the hypothesis
            scores[4][1].append(
                meteor_score([' '.join(expected_list)], ' '.join(prediction)))

    for score in scores:
        print(score[0] + str(sum(score[1]) / len(score[1])))

    return 0
Example #4
def print_metrics(model, device, dataset, dataloader):
    references, hypotheses = get_references_and_hypotheses(
        model, device, dataset, dataloader)

    # bleu scores
    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu_3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
    bleu_4 = corpus_bleu(references, hypotheses)

    print('BLEU-1 ({})\t'
          'BLEU-2 ({})\t'
          'BLEU-3 ({})\t'
          'BLEU-4 ({})\t'.format(bleu_1, bleu_2, bleu_3, bleu_4))

    # meteor score
    total_m_score = 0.0

    for i in range(len(references)):
        actual = [" ".join(ref) for ref in references[i]]
        total_m_score += meteor_score(actual, " ".join(hypotheses[i]))

    m_score = total_m_score / len(references)

    print('Meteor Score: {}'.format(m_score))

    metrics = {
        'bleu_1': bleu_1,
        'bleu_2': bleu_2,
        'bleu_3': bleu_3,
        'bleu_4': bleu_4,
        'meteor': m_score
    }

    return metrics
Example #5
    def compute_score(self, candidate: str, references: List[str]) -> Tensor:
        score = meteor_score(references,
                             candidate,
                             alpha=self.alpha,
                             beta=self.beta,
                             gamma=self.gamma)
        return torch.scalar_tensor(score)
Example #6
def getBLUEAndMEteroScores(sumDoc, refDoc, tClean):
    #refDoc = tClean.getSentTokenization(refDoc)
    BLUE = sentence_bleu(refDoc, sumDoc)
    MEtero = meteor_score(refDoc, sumDoc)
    #
    #
    return "{}\t{}".format(tClean.toRound(BLUE), tClean.toRound(MEtero))
Example #7
def calculate_m_score(target, predictions, length):

    score = 0

    for t, p in zip(target, predictions):
        score += meteor_score(t, p)

    return score / length
Example #8
def _get_sent_meteor(
        hypothesis: List[str], references: List[List[str]],
        extra_args: Optional[Dict[str, str]] = None
) -> List[float]:
    joined_references = list(zip(*references))
    return [
        meteor_score(r, h) for r, h in zip(joined_references, hypothesis)
    ]
Example #9
def calculate_meteor(results):
    meteor_scores = []
    for key in results:
        references = results[key][1]
        hypothesis = results[key][0]
        score = meteor_score([' '.join(reference) for reference in references],
                             ' '.join(hypothesis))
        meteor_scores.append(score)
    return statistics.mean(meteor_scores), statistics.stdev(meteor_scores)
Example #10
    def compute_(self, **kwargs):
        question_decoded = self.dataset.question_tokenizer.decode(
            kwargs["state"].text.numpy()[0],
            ignored=["<SOS>"],
            stop_at_end=True)
        ref_questions = kwargs["ref_questions_decoded"]
        score = meteor_score(references=ref_questions,
                             hypothesis=question_decoded)
        self.metric.append(score)
Example #11

def Metrics(file_loc, increment=4, embedding_dict=None):
    rouge_p = []
    rouge_r = []
    rouge_f = []
    bleu = []
    fp = open(file_loc)
    D = fp.readlines()
    r_pre = 0.0
    r_rec = 0.0
    r_f1 = 0.0
    bert = 0.0
    sent_bleu = 0.0
    meteor_s = 0.0
    cnt_ = 0
    i = 0
    while i < len(D):
        tar = D[i + 2].split()[1:]
        mod = D[i + 1].split()[1:]

        if '<eor>' in tar:
            ind_tar = tar.index('<eor>')
        else:
            ind_tar = -1
        if '<eor>' in mod:
            ind_mod = mod.index('<eor>')
        else:
            ind_mod = -1
        tar_embs = []
        mod_embs = []
        for word in tar[:ind_tar]:
            if word in embedding_dict:
                tar_embs += [embedding_dict[word]]
        tar_embs = np.stack(tar_embs)
        for word in mod[:ind_mod]:
            if word in embedding_dict:
                mod_embs += [embedding_dict[word]]
        mod_embs = np.stack(mod_embs)
        tar_emb = np.sum(tar_embs, axis=0)
        mod_emb = np.sum(mod_embs, axis=0)
        bert -= np.mean((tar_emb - mod_emb)**2)
        r_scores = R.get_scores(' '.join(mod[:ind_mod]),
                                ' '.join(tar[:ind_tar]))
        sent_bleu += bleu_met([mod[:ind_mod]], tar[:ind_tar], (0.5, 0.5))
        meteor_s += meteor_score([' '.join(mod[:ind_mod])],
                                 ' '.join(tar[:ind_tar]))
        r_pre += r_scores[0]['rouge-l']['p']
        r_rec += r_scores[0]['rouge-l']['r']
        r_f1 += r_scores[0]['rouge-l']['f']
        i += increment
        cnt_ += 1
    return {
        'METEOR': meteor_s / float(cnt_),
        'BLEU': sent_bleu / float(cnt_),
        'F1': r_f1 / float(cnt_),
        'BERT': bert / float(cnt_)
    }
Example #12
def get_meteor_score(result_list):

    total_meteor = 0
    for line in result_list:
        single_reference = [line[0], line[1]]
        # reference_list.append(single_reference)
        # candidate_list.append(line[2])
        score = meteor_score(single_reference, line[2], wordnet=wordnet)
        total_meteor += score
    print("meteor_score: ", total_meteor / len(result_list))
Example #13
def get_meteor_score(hypothesis: List[List[str]],
                     reference: List[str]) -> list:
    meteor_score_list = []
    for (hyp, ref) in zip(hypothesis, reference):
        try:
            # meteor_score takes the reference(s) first, then the hypothesis string
            m_score = meteor_score([ref], " ".join(hyp))
            meteor_score_list.append(m_score)
        except Exception:
            continue

    return meteor_score_list
Example #14
def _compute_meteor(reference, predict):
	"""Compute the METEOR score for one prediction.
	`reference` is a list of reference sentences, e.g. ["I have a car"],
	and `predict` is the predicted sentence as a string.
	"""
	meteor = meteor_score(reference, predict)
	# single_meteor_score compares a single reference to a single hypothesis
	# meteor = single_meteor_score(reference, predict)
	return meteor
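A small usage sketch for the helper above, assuming an NLTK version that accepts raw strings and that the caller passes the references as a list of sentence strings (the data below is illustrative):

# Illustrative call of _compute_meteor with two references and one prediction.
references = ["I have a car", "I own a car"]
prediction = "I have a small car"
print(_compute_meteor(references, prediction))  # best METEOR score over the two references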
Example #15
def corpus_meteor(list_of_refs, list_of_hypos):
    # The original input format of the METEOR metric is different from the BLEU series;
    # this function converts the BLEU-style references to fit METEOR.
    Meteor = 0.0

    for i, ref in enumerate(list_of_refs):
        ref_list_tmp = [' '.join(intlist2strlist(val)) for val in ref]
        hypo_tmp = ' '.join(intlist2strlist(list_of_hypos[i]))
        Meteor += meteor_score(ref_list_tmp, hypo_tmp)

    return Meteor / (len(list_of_hypos))
Example #16

    def compute(self, hypotheses, references):

        try:
            nltk.data.find('corpora/wordnet')
        except LookupError:
            nltk.download('wordnet')

        return sum([
            meteor_score([ref], hyp)
            # zip(hypotheses, references) yields (hypothesis, reference) pairs
            for (hyp, ref) in zip(hypotheses, references)
        ]) / len(references)
Example #17
	def forward(self, hypothesis: List[List[str]], references: List[List[List[str]]]) -> float:
		if len(hypothesis) != len(references):
			raise ValueError(f'Batch size of hypothesis and references are different ({len(hypothesis)} != {len(references)}).')

		batch_scores = []
		for hyp, refs in zip(hypothesis, references):
			hyp = ' '.join(hyp)
			refs = [' '.join(ref) for ref in refs]
			score = meteor_score(hypothesis=hyp, references=refs, alpha=self.alpha, beta=self.beta, gamma=self.gamma)
			batch_scores.append(score)

		return torch.mean(torch.as_tensor(batch_scores)).item()
Example #18
    def test_preprocess(self):
        # Using lists instead of strings specifically to demonstrate use of `preprocess`.
        reference = [["this", "is", "a", "test"], ["this", "is" "test"]]
        candidate = ["this", "is", "a", "test"]

        # no `preprocess` argument
        self.assertRaises(TypeError, meteor_score, reference, candidate)

        # with `preprocess` argument
        score = meteor_score(reference,
                             candidate,
                             preprocess=lambda x: " ".join(x))
        assert score == 0.9921875
Example #19
def corpus_meteor(references, hypotheses):
    """ The original input format of Meteor metric is different form BLEU series.
        In this function, we change the format of BLEU to fit Meteor.
    """
    def to_str(values):
        return [str(val) for val in values]

    Meteor = 0.0
    for gt_group, pred in zip(references, hypotheses):
        gt = [' '.join(to_str(val)) for val in gt_group]
        pred = ' '.join(to_str(pred))
        Meteor += meteor_score(gt, pred)
    return Meteor / (len(references))
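A small usage sketch for corpus_meteor above, assuming BLEU-style inputs in which each hypothesis is a list of token ids and each entry of references is a group of such lists, and an NLTK version that accepts raw strings (the data below is illustrative):

# Illustrative call: two instances, the second with two references.
references = [[[1, 2, 3, 4]], [[5, 6, 7], [5, 6, 8]]]
hypotheses = [[1, 2, 3, 4], [5, 6, 7]]
print(corpus_meteor(references, hypotheses))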
Example #20

def METEOR(image_names, captions, encoder, decoder, range1=6000, range2=7000):
  scoreList4 = []
  for i in range(range1, range2):
    image_path = image_names[i]
    result, attention_plot = evaluate(image_path, encoder, decoder)
    result = result[:-1]
    hypothesis = ' '.join(result)
    # score against each of the five reference captions and keep the best match;
    # meteor_score takes the reference(s) first, then the hypothesis
    references = [' '.join(captions[image_path][k].split()) for k in range(5)]
    score = max(meteor_score([ref], hypothesis) for ref in references)
    scoreList4.append(score)
  return (sum(scoreList4) * 100) / len(scoreList4)
Example #21
    def sim_meteor(self, hyps, ref):
        """
        :param hyps - a list of hypothesis strings
        :param ref - a string representing the reference
        :return max METEOR score over the hypotheses
        :return mean METEOR score over the hypotheses
        """
        scores = []
        for hyp in hyps:
            #try:
            scores.append(meteor_score([ref], hyp))
            #except:
            #    scores.append(0.0)
        return np.max(scores), np.mean(scores)
Example #22

def get_scores(model, loader, word_dict, idx_dict, device, debug):
    model.eval()
    references = []
    hypotheses = []
    for batch_idx, (imgs, captions, all_captions) in tqdm(enumerate(loader)):
        imgs, captions = Variable(imgs).to(device), Variable(captions).to(
            device)
        max_timespan = max([
            len(caption) for caption in captions
        ]) - 1  # -1, because the model is assumed to have already generated the start token
        preds, alphas = model(imgs, max_timespan)

        for cap_set in all_captions.tolist():
            caps = []
            for caption in cap_set:
                cap = [
                    word_idx for word_idx in caption
                    if word_idx != word_dict['<start>']
                    and word_idx != word_dict['<pad>']
                ]
                caps.append(cap)
            references.append(caps)

        word_idxs = torch.max(preds, dim=2)[1]
        for idxs in word_idxs.tolist():
            hypotheses.append([
                idx for idx in idxs
                if idx != word_dict['<start>'] and idx != word_dict['<pad>']
            ])
        if debug:
            break

    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu_3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0))
    bleu_4 = corpus_bleu(references, hypotheses)
    score = []
    for i in range(len(references)):
        references_i = []
        for j in references[i]:
            words = []
            for k in j:
                words.append(idx_dict[k])
            references_i.append(' '.join(words))
        hypo_i = []
        for j in hypotheses[i]:
            hypo_i.append(idx_dict[j])
        score.append(meteor_score.meteor_score(references_i, ' '.join(hypo_i)))
    return (bleu_1, bleu_2, bleu_3, bleu_4, np.mean(score))
Example #23
def calculate_metrics(predict, reference):
    reference_len = len(reference)
    predict_len = len(predict)

    #-------------------bleu----------
    bleu_2 = bleu(predict, reference, 2)
    bleu_4 = bleu(predict, reference, 4)
    #-------------------nist----------
    nist_2 = nist(predict, reference, 2)
    nist_4 = nist(predict, reference, 4)
    #-------------------meteor----------
    predict = " ".join(predict)
    reference = " ".join(reference)
    meteor_scores = meteor_score([reference], predict)
    return bleu_2, bleu_4, nist_2, nist_4, meteor_scores
Example #24
def get_metrics(pred, target):
    turns = len(target)
    bleu_2 = 0
    bleu_4 = 0
    meteor = 0
    nist_2 = 0
    nist_4 = 0
    for index in range(turns):
        pred_utt = pred[index]
        target_utt = target[index]
        min_len = min(len(pred_utt), len(target_utt))
        lens = min(min_len, 4)
        if lens == 0:
            continue
        if lens >= 4:
            bleu_4_utt = sentence_bleu(
                [target_utt],
                pred_utt,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=SmoothingFunction().method1)
            nist_4_utt = sentence_nist([target_utt], pred_utt, 4)
        else:
            bleu_4_utt = 0
            nist_4_utt = 0
        if lens >= 2:
            bleu_2_utt = sentence_bleu(
                [target_utt],
                pred_utt,
                weights=(0.5, 0.5),
                smoothing_function=SmoothingFunction().method1)
            nist_2_utt = sentence_nist([target_utt], pred_utt, 2)
        else:
            bleu_2_utt = 0
            nist_2_utt = 0

        bleu_2 += bleu_2_utt
        bleu_4 += bleu_4_utt
        meteor += meteor_score([" ".join(target_utt)], " ".join(pred_utt))
        nist_2 += nist_2_utt
        nist_4 += nist_4_utt

    bleu_2 /= turns
    bleu_4 /= turns
    meteor /= turns
    nist_2 /= turns
    nist_4 /= turns
    return bleu_2, bleu_4, meteor, nist_2, nist_4
Example #25
def main():
    nltk.data.path.append('/data/chuancen/pip_package/nltk_data')
    print(nltk.__version__)
    file_handler = open('../../result/reference_SR_only.txt', 'r')
    ref = file_handler.readlines()
    file_handler = open('../../result/SR_only.txt', 'r')
    hyp = file_handler.readlines()

    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    meteor_sum = 0
    for i in range(min(len(ref), len(hyp))):
        meteor_sum += meteor_score([ref[i]], hyp[i])

    meteor_sum /= min(len(ref), len(hyp))
    print(meteor_sum)

    tokenizer = GPT2Tokenizer.from_pretrained(
        '/data/chuancen/LIT/models/345M_Alex')
Example #26
    def compute_score(self, gts, res):
        assert(gts.keys() == res.keys())
        imgIds = gts.keys()
        scores = []

        for i in imgIds:
            assert(len(res[i]) == 1)
            score = round(meteor_score(gts[i], res[i][0]), 4)
            scores.append(score)
        #print('{}\n'.format(eval_line))
        #self.meteor_p.stdin.write('{}\n'.format(eval_line))
        #print(self.meteor_p.stdout.readline().strip())

        #for i in range(0,len(imgIds)):
        #    scores.append(float(self.meteor_p.stdout.readline().strip()))
        #score = float(self.meteor_p.stdout.readline().strip())
        #self.lock.release()

        return sum(scores)/len(scores), scores
Example #27
    def _calc_metrics_info(self, generate_corpus, reference_corpus):
        generate_corpus = [
            self._preprocess(generate_sentence)
            for generate_sentence in generate_corpus
        ]
        reference_corpus = [
            self._preprocess(reference_sentence)
            for reference_sentence in reference_corpus
        ]
        reference_corpus = [[reference_sentence]
                            for reference_sentence in reference_corpus]

        result = {}
        scores = []
        for gen, refs in zip(generate_corpus, reference_corpus):
            score = meteor_score(refs, gen)
            scores.append(score)

        result['meteor'] = scores
        return result
Example #28
def test(encoder, decoder, dataloader):
    score = 0
    n = 0
    for j, batch in tqdm(enumerate(dataloader)):
        input_tensor, target_tensor = batch
        if torch.cuda.is_available():
            target_tensor = target_tensor.cuda()
            input_tensor = input_tensor.cuda()
        a, b = predict(encoder, decoder, input_tensor, target_tensor)
        try:
            score += meteor_score(a, b)
            n += 1
        except Exception:
            pass
        input_text = []
        for i in range(input_tensor.shape[1]):
            input_text.append(num2code[input_tensor[:, i].view(1).item()])
            if input_text[-1] == 'PAD':
                break
        print(' '.join(input_text[1:-1]))
        print(a)
    score /= n
    print('METEOR: {}'.format(score))
Example #29

def get_rouge_meteor_from_output(s_pred, s_true, reverse_word_map, order, weights):
    r_score_tot = 0
    k_score_tot = 0
    for batch in range(s_pred.shape[0]):
        sentence = ''
        sentence_true = ['']
        for word in range(s_pred.shape[2]):

            encoded_word = reverse_word_map.get(np.argmax(s_pred[batch, :, word]))
            if encoded_word:
                sentence +=' '+ encoded_word

            true_word = reverse_word_map.get(s_true[batch, word])
            if true_word:
                sentence_true[0] += " "+ true_word
        r_score = rouge.get_scores(sentence_true[0], sentence)
        k_score = meteor_score(sentence_true, sentence)  # references first, then the hypothesis

        r_score = r_score[0]['rouge-1']['f']
        r_score_tot += r_score/s_pred.shape[0]
        k_score_tot += k_score/s_pred.shape[0]

    return r_score_tot, k_score_tot
Example #30
    def test_meteor(self):
        score = meteor_score(self.reference, self.candidate, preprocess=str.lower)
        assert score == 0.9921875