Example 1
def evaluate_narrative_qa(ground_truth, predicted_answers):
  """Evaluation NarrativeQA predictions."""
  scorers = [(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
             (Rouge(), 'ROUGE_L'), (Cider(), 'CIDEr')]

  def preprocess(text):
    return text.lower().rstrip(' .').strip()

  common_keys = [k for k in predicted_answers if k in ground_truth]
  refs = {k: [preprocess(s) for s in ground_truth[k]] for k in common_keys}
  hyps = {k: [preprocess(predicted_answers[k])] for k in common_keys}

  ret_scores = dict(common=len(common_keys))
  for scorer, method in scorers:
    score, scores = scorer.compute_score(refs, hyps)
    if isinstance(method, list):
      for sc, _, m in zip(score, scores, method):
        # print('%s: %0.6f' % (m, sc))
        ret_scores[m] = sc * 100
    else:
      # print('%s: %0.6f' % (method, score))
      ret_scores[method] = score * 100
    if isinstance(scorer, Meteor):
      scorer.close()
  del scorers
  return ret_scores
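
A minimal usage sketch for the function above, assuming the scorer classes are imported from nlg-eval's bundled pycocoevalcap package (METEOR needs a Java runtime; the ids and answers below are made up):

from nlgeval.pycocoevalcap.bleu.bleu import Bleu
from nlgeval.pycocoevalcap.cider.cider import Cider
from nlgeval.pycocoevalcap.meteor.meteor import Meteor  # only needed for the isinstance check
from nlgeval.pycocoevalcap.rouge.rouge import Rouge

# Hypothetical data: question id -> list of gold answers / predicted answer string.
ground_truth = {'q1': ['He went to the market.', 'He visited the market.']}
predicted_answers = {'q1': 'he went to the market'}

scores = evaluate_narrative_qa(ground_truth, predicted_answers)
print(scores)  # {'common': 1, 'Bleu_1': ..., ..., 'ROUGE_L': ..., 'CIDEr': ...}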
Example 2
def compute_individual_metrics(ref,
                               hyp,
                               no_overlap=False,
                               no_skipthoughts=True,
                               no_glove=False):
    assert isinstance(hyp, six.string_types)

    if isinstance(ref, six.string_types):
        ref = ref.split(
            '||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score

    return ret_scores
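
A hypothetical single-pair call to the function above; several references can be packed into one string with the '||<|>||' delimiter (the values are illustrative, and METEOR requires Java):

refs = "the cat sat on the mat||<|>||a cat was sitting on the mat"
hyp = "the cat is on the mat"
metrics = compute_individual_metrics(refs, hyp)
print(metrics)  # Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr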
Example 3
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
    assert isinstance(hyp, six.string_types)

    if isinstance(ref, six.string_types):
        ref = ref.split('||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
        cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
        cosine_similarity = np.max(cosine_similarity, axis=0).mean()
        ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    return ret_scores
Example 4
def compute_metrics(hypothesis,
                    references,
                    no_overlap=False,
                    no_skipthoughts=True,
                    no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
        del scorers

    return ret_scores
Example 5
    def load_scorers(self):
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
Example 6
    def load_scorers(self):
        self.scorers = []

        omit_bleu_i = False
        for i in range(1, 4 + 1):
            if 'Bleu_{}'.format(i) in self.metrics_to_omit:
                omit_bleu_i = True
                if i > 1:
                    self.scorers.append((Bleu(i - 1), ['Bleu_{}'.format(j) for j in range(1, i)]))
                break
        if not omit_bleu_i:
            self.scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))

        if 'ROUGE_L' not in self.metrics_to_omit:
            self.scorers.append((Rouge(), "ROUGE_L"))
        if 'CIDEr' not in self.metrics_to_omit:
            self.scorers.append((Cider(), "CIDEr"))
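
The BLEU branch above exists because a single Bleu(n) scorer always reports orders 1..n, so omitting one order means instantiating a lower-order scorer instead. Illustrative outcomes for a few hypothetical metrics_to_omit values:

# metrics_to_omit = set()       -> [(Bleu(4), ['Bleu_1'..'Bleu_4']), (Rouge(), 'ROUGE_L'), (Cider(), 'CIDEr')]
# metrics_to_omit = {'Bleu_3'}  -> [(Bleu(2), ['Bleu_1', 'Bleu_2']), (Rouge(), 'ROUGE_L'), (Cider(), 'CIDEr')]
# metrics_to_omit = {'Bleu_1'}  -> [(Rouge(), 'ROUGE_L'), (Cider(), 'CIDEr')]  (the loop breaks before adding any BLEU)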
Example 7
def compute_metrics(ref, hyp):
    # ref = ref.split('||<|>||')  # special delimiter
    #ref = [a.strip() for a in ref]
    refs = {0: [ref]}
    #ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                ret_scores[m] = sc
        else:
            ret_scores[method] = score
    return ret_scores
Example 8
def compute_metrics_all(references, hypothesises):
    refs = {
        idx: [strippedlines.strip()]
        for (idx, strippedlines) in enumerate(references)
    }
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesises)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                #print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            #print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
    return ret_scores
Example 9
def compute_metrics(gt_caps, pred_caps):
    assert len(gt_caps) == len(pred_caps)
    gt_caps = add_space_to_cap_dict(gt_caps)
    pred_caps = add_space_to_cap_dict(pred_caps)

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gt_caps, pred_caps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
        if isinstance(scorer, Meteor):
            scorer.close()
    del scorers
    return ret_scores
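
The helper add_space_to_cap_dict is not shown in this example. One plausible sketch (purely an assumption about its intent) is that it normalizes each caption into a space-joined token string, since the pycocoevalcap scorers expect pre-tokenized text:

def add_space_to_cap_dict(cap_dict):
    # Hypothetical helper: cap_dict maps an id to a list of caption strings.
    return {key: [' '.join(cap.split()) for cap in caps]
            for key, caps in cap_dict.items()}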
Example 10
def compute_metrics_by_file(references, hypothesis):
    """
    Given a list of gold file names and a predict result file,
    calculate metrics. Same line number corresponds to the same
    instance to calculate metric.
    Ref: https://github.com/Maluuba/nlg-eval
    :param references: list of gold file names.
    :param hypothesis: predict file name.
    :return: a list of metric results.
    """
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]

    def _strip(s):
        return s.strip()

    with open(hypothesis, encoding='utf-8') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, encoding='utf-8') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}

    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                # print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            # print("%s: %0.6f" % (method, score))
            ret_scores[method] = score

    return ret_scores
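
A hypothetical end-to-end call for the function above, writing two reference files and one hypothesis file (one instance per line; the file names are made up, and METEOR needs Java):

import os
import tempfile

tmp_dir = tempfile.mkdtemp()
files = {'ref_a.txt': ['the cat sat on the mat', 'a dog barked'],
         'ref_b.txt': ['a cat was on the mat', 'the dog was barking'],
         'hyp.txt': ['the cat is on the mat', 'a dog barked loudly']}
for name, lines in files.items():
    with open(os.path.join(tmp_dir, name), 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')

scores = compute_metrics_by_file(
    references=[os.path.join(tmp_dir, 'ref_a.txt'), os.path.join(tmp_dir, 'ref_b.txt')],
    hypothesis=os.path.join(tmp_dir, 'hyp.txt'))
print(scores)  # Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr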
Example 11
def compute_metrics(hypothesis,
                    references,
                    no_overlap=False,
                    no_skipthoughts=False,
                    no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list],
                                     verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(
            lambda refl: encoder.encode([r.strip() for r in refl],
                                        verbose=False), ref_list_T)
        cosine_similarity = list(
            map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(),
                vector_refs))
        cosine_similarity = np.max(cosine_similarity, axis=0).mean()
        print("SkipThoughtsCosineSimilarity: %0.6f" % (cosine_similarity))
        ret_scores['SkipThoughtCS'] = cosine_similarity
        del model

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        print(scores)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    return ret_scores
Example 12
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(str.strip, refs)) for refs in zip(*ref_list)]  # list() so each reference set can be iterated more than once
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    ret1_scores = {}  # holds the BLEU scores separately; they are not used in the final mark
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    # print("%s: %0.6f" % (m, sc))  # the BLEU scores
                    ret1_scores[m] = sc
            else:
                # print("%s: %0.6f" % (method, score))  # METEOR, ROUGE_L and CIDEr
                ret_scores[method] = score

    # if not no_skipthoughts:
    #     from nlgeval.skipthoughts import skipthoughts
    #     import numpy as np
    #     from sklearn.metrics.pairwise import cosine_similarity

    #     model = skipthoughts.load_model()
    #     encoder = skipthoughts.Encoder(model)
    #     vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
    #     ref_list_T = np.array(ref_list).T.tolist()
    #     vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
    #     cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
    #     cosine_similarity = np.max(cosine_similarity, axis=0).mean()
    #     print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
    #     ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np

        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        #print(scores)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    # return ret_scores

    ret_scores["METEOR"]=ret_scores["METEOR"]*a
    ret_scores["ROUGE_L"]=ret_scores["ROUGE_L"]*b
    ret_scores["CIDEr"]=ret_scores["CIDEr"]*c
    ret_scores["EmbeddingAverageCosineSimilairty"]=ret_scores["EmbeddingAverageCosineSimilairty"]*d
    ret_scores["VectorExtremaCosineSimilarity"]=ret_scores["VectorExtremaCosineSimilarity"]*e
   # ret_scores["GreedyMatchingScore"]=ret_scores["GreedyMatchingScore"]*f

    # Total of the weighted metric scores (avoids shadowing the builtin sum).
    total = (ret_scores["METEOR"] + ret_scores["ROUGE_L"] + ret_scores["CIDEr"]
             + ret_scores["EmbeddingAverageCosineSimilairty"]
             + ret_scores["VectorExtremaCosineSimilarity"])

    marks = total * maximum_marks
    print("Marks: %0.2f" % marks)
Example 13
import re
import string

# The sentence-level BLEU helper comes from NLTK; the Meteor and Rouge scorers are
# assumed to be nlg-eval's bundled pycocoevalcap implementations.
from nltk.translate.bleu_score import sentence_bleu
from nlgeval.pycocoevalcap.meteor.meteor import Meteor
from nlgeval.pycocoevalcap.rouge.rouge import Rouge


def normalize_answer(s):
    """Lower-case, strip punctuation and articles, and collapse whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


meteor_scorer = Meteor()
rouge_scorer = Rouge()


def ans_score(ans, gold_list):
    ans = normalize_answer(ans)
    gold_list = [normalize_answer(ref) for ref in gold_list]
    bleu = sentence_bleu([_.split() for _ in gold_list],
                         ans.split(),
                         weights=(1, 0, 0, 0))
    meteor, _ = meteor_scorer.compute_score({0: gold_list}, {0: [ans]})
    rouge, _ = rouge_scorer.compute_score({0: gold_list}, {0: [ans]})
    return {'bleu': bleu, 'meteor': meteor, 'rouge': rouge}
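
A hypothetical call to ans_score with one predicted answer and two gold references (the Meteor scorer spawns a Java subprocess, so Java must be available):

result = ans_score('The Eiffel Tower.', ['the Eiffel tower', 'Eiffel Tower, in Paris'])
print(result)  # {'bleu': ..., 'meteor': ..., 'rouge': ...}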


def evaluate(test_annotation_file, user_annotation_file, phase_codename,
             **kwargs):