def evaluate_narrative_qa(ground_truth, predicted_answers):
    """Evaluate NarrativeQA predictions against reference answers."""
    scorers = [
        (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
        (Rouge(), 'ROUGE_L'),
        (Cider(), 'CIDEr')
    ]

    def preprocess(text):
        return text.lower().rstrip(' .').strip()

    # Score only the question ids present in both dictionaries.
    common_keys = [k for k in predicted_answers if k in ground_truth]
    refs = {k: [preprocess(s) for s in ground_truth[k]] for k in common_keys}
    hyps = {k: [preprocess(predicted_answers[k])] for k in common_keys}

    ret_scores = dict(common=len(common_keys))
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, _, m in zip(score, scores, method):
                # print('%s: %0.6f' % (m, sc))
                ret_scores[m] = sc * 100
        else:
            # print('%s: %0.6f' % (method, score))
            ret_scores[method] = score * 100
        if isinstance(scorer, Meteor):
            scorer.close()
    del scorers
    return ret_scores
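# A minimal usage sketch for evaluate_narrative_qa. The data below is hypothetical;
# it only assumes the Bleu, Rouge and Cider scorer classes used above are importable.
ground_truth = {
    'q1': ['He sailed to the island.', 'He went to the island by boat.'],
    'q2': ['She was his sister.'],
}
predicted_answers = {
    'q1': 'He sailed to the island.',
    'q2': 'His sister.',
}
scores = evaluate_narrative_qa(ground_truth, predicted_answers)
# Scores are reported on a 0-100 scale; 'common' is the number of shared question ids.
print(scores['Bleu_1'], scores['ROUGE_L'], scores['CIDEr'], scores['common'])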
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=True, no_glove=False):
    assert isinstance(hyp, six.string_types)

    if isinstance(ref, six.string_types):
        ref = ref.split('||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score
    return ret_scores
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
    assert isinstance(hyp, six.string_types)

    if isinstance(ref, six.string_types):
        ref = ref.split('||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
        # For each reference set, take the cosine similarity with the hypothesis,
        # then keep the maximum over reference sets and average over instances.
        similarities = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
        ret_scores['SkipThoughtCS'] = np.max(similarities, axis=0).mean()

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        for score in scores.split('\n'):
            name, value = score.split(':')
            ret_scores[name] = float(value.strip())

    return ret_scores
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=True, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    # Group the i-th line of every reference file together.
    ref_list = [[r.strip() for r in refs] for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
        del scorers
    return ret_scores
def load_scorers(self):
    self.scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
def load_scorers(self):
    self.scorers = []

    # BLEU_1..BLEU_4 are computed jointly, so if some Bleu_i is omitted only the
    # lower orders can be kept: truncate to Bleu(i - 1).
    omit_bleu_i = False
    for i in range(1, 4 + 1):
        if 'Bleu_{}'.format(i) in self.metrics_to_omit:
            omit_bleu_i = True
            if i > 1:
                self.scorers.append((Bleu(i - 1), ['Bleu_{}'.format(j) for j in range(1, i)]))
            break
    if not omit_bleu_i:
        self.scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))

    if 'ROUGE_L' not in self.metrics_to_omit:
        self.scorers.append((Rouge(), "ROUGE_L"))
    if 'CIDEr' not in self.metrics_to_omit:
        self.scorers.append((Cider(), "CIDEr"))
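# A small sketch of the truncation behaviour above. DummyEval is a hypothetical
# harness that only exists to host metrics_to_omit; load_scorers is the function
# defined directly above, called with the dummy instance as `self`.
class DummyEval:
    def __init__(self, metrics_to_omit):
        self.metrics_to_omit = set(metrics_to_omit)
        self.scorers = []

e = DummyEval({'Bleu_3'})
load_scorers(e)
print([names for _, names in e.scorers])
# Omitting Bleu_3 keeps only Bleu(2), i.e. ['Bleu_1', 'Bleu_2'], plus 'ROUGE_L' and 'CIDEr'.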
def compute_metrics(ref, hyp):
    # ref = ref.split('||<|>||')  # special delimiter
    # ref = [a.strip() for a in ref]
    refs = {0: [ref]}
    # ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                ret_scores[m] = sc
        else:
            ret_scores[method] = score
    return ret_scores
def compute_metrics_all(references, hypothesises):
    refs = {idx: [strippedlines.strip()] for (idx, strippedlines) in enumerate(references)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesises)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                # print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            # print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
    return ret_scores
def compute_metrics(gt_caps, pred_caps):
    assert len(gt_caps) == len(pred_caps)
    gt_caps = add_space_to_cap_dict(gt_caps)
    pred_caps = add_space_to_cap_dict(pred_caps)

    ret_scores = {}
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gt_caps, pred_caps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
        if isinstance(scorer, Meteor):
            scorer.close()
    del scorers
    return ret_scores
def compute_metrics_by_file(references, hypothesis):
    """
    Given a list of gold (reference) files and a prediction file, compute metrics.
    The same line number across all files refers to the same instance.
    Ref: https://github.com/Maluuba/nlg-eval
    :param references: list of gold file names.
    :param hypothesis: prediction file name.
    :return: a dict of metric results.
    """
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]

    def _strip(s):
        return s.strip()

    with open(hypothesis, encoding='utf-8') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, encoding='utf-8') as f:
            ref_list.append(f.readlines())
    # Group the i-th line of every reference file together.
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                # print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            # print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
    return ret_scores
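# A hedged usage sketch for the file-based entry point above. The file names are
# hypothetical; each reference file must have one line per line of the hypothesis file,
# aligned by line number.
scores = compute_metrics_by_file(['ref0.txt', 'ref1.txt'], 'hyp.txt')
for name, value in scores.items():
    print('%s: %0.4f' % (name, value))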
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    # Group the i-th line of every reference file together.
    ref_list = [[r.strip() for r in refs] for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
        # Best cosine similarity over the reference sets, averaged over instances.
        similarities = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
        skip_thought_cs = np.max(similarities, axis=0).mean()
        print("SkipThoughtsCosineSimilarity: %0.6f" % skip_thought_cs)
        ret_scores['SkipThoughtCS'] = skip_thought_cs
        del model

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        print(scores)
        for score in scores.split('\n'):
            name, value = score.split(':')
            ret_scores[name] = float(value.strip())

    return ret_scores
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    # Materialize the stripped lines; a bare map() would be a lazy iterator in Python 3.
    ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    ret1_scores = {}  # per-order BLEU scores, collected but not used for marking
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                # BLEU returns a list of scores (Bleu_1..Bleu_4).
                for sc, scs, m in zip(score, scores, method):
                    ret1_scores[m] = sc
            else:
                # METEOR, ROUGE_L and CIDEr each return a single score.
                ret_scores[method] = score

    # The SkipThoughts similarity block is disabled in this variant.
    # if not no_skipthoughts:
    #     from nlgeval.skipthoughts import skipthoughts
    #     import numpy as np
    #     from sklearn.metrics.pairwise import cosine_similarity
    #     model = skipthoughts.load_model()
    #     encoder = skipthoughts.Encoder(model)
    #     vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
    #     ref_list_T = np.array(ref_list).T.tolist()
    #     vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
    #     cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
    #     cosine_similarity = np.max(cosine_similarity, axis=0).mean()
    #     print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
    #     ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        for score in scores.split('\n'):
            name, value = score.split(':')
            ret_scores[name] = float(value.strip())

    # a, b, c, d, e and maximum_marks are per-metric weights and the grading scale;
    # they are expected to be defined elsewhere in the module.
    ret_scores["METEOR"] = ret_scores["METEOR"] * a
    ret_scores["ROUGE_L"] = ret_scores["ROUGE_L"] * b
    ret_scores["CIDEr"] = ret_scores["CIDEr"] * c
    ret_scores["EmbeddingAverageCosineSimilairty"] = ret_scores["EmbeddingAverageCosineSimilairty"] * d
    ret_scores["VectorExtremaCosineSimilarity"] = ret_scores["VectorExtremaCosineSimilarity"] * e
    # ret_scores["GreedyMatchingScore"] = ret_scores["GreedyMatchingScore"] * f

    total = (ret_scores["METEOR"] + ret_scores["ROUGE_L"] + ret_scores["CIDEr"]
             + ret_scores["EmbeddingAverageCosineSimilairty"]
             + ret_scores["VectorExtremaCosineSimilarity"])
    marks = total * maximum_marks
    print("Marks: %0.2f" % marks)
# Assumes `re`, `string`, nltk's `sentence_bleu`, and the Meteor and Rouge scorers
# are imported at module level.
def normalize_answer(s):
    """Lower-case, strip punctuation and articles, and collapse whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


meteor_scorer = Meteor()
rouge_scorer = Rouge()


def ans_score(ans, gold_list):
    ans = normalize_answer(ans)
    gold_list = [normalize_answer(ref) for ref in gold_list]
    bleu = sentence_bleu([_.split() for _ in gold_list], ans.split(), weights=(1, 0, 0, 0))
    meteor, _ = meteor_scorer.compute_score({0: gold_list}, {0: [ans]})
    rouge, _ = rouge_scorer.compute_score({0: gold_list}, {0: [ans]})
    return {'bleu': bleu, 'meteor': meteor, 'rouge': rouge}


def evaluate(test_annotation_file, user_annotation_file, phase_codename, **kwargs):
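# For reference, a hedged usage sketch for ans_score defined above. The strings are
# toy data; it assumes the Java METEOR jar backing Meteor() is available locally.
gold = ['He sailed to the island.', 'He travelled to the island by boat.']
pred = 'He sailed to an island'
print(ans_score(pred, gold))
# e.g. {'bleu': ..., 'meteor': ..., 'rouge': ...}, each value in [0, 1]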