Example #1
    def get_metrics(self, ref_dir, dec_dir):
        reference = []
        decoded = []

        # Pair decoded and reference files by sorted filename order.
        for dec_path, ref_path in zip(
                sorted(glob.glob(os.path.join(dec_dir, '*.txt'))),
                sorted(glob.glob(os.path.join(ref_dir, '*.txt')))):
            ref_tex = ''
            dec_tex = ''

            for line in open(dec_path).readlines():
                dec_tex += line.strip()
            # ROUGE cannot score an empty hypothesis, so substitute a space.
            if len(dec_tex) == 0:
                dec_tex = ' '

            for line in open(ref_path).readlines():
                ref_tex += line

            reference.append(ref_tex)
            decoded.append(dec_tex)

        if len(reference) != len(decoded):
            raise ValueError(
                "Hypotheses and References don't have equal lengths")

        rouge_dict = rouge.rouge(decoded, reference)
        file_path = os.path.join(self._decode_dir, 'results.txt')
        with open(file_path, 'w') as f:
            for key in rouge_dict:
                print("%s\t%f" % (key, rouge_dict[key]), file=f)
            bleu_score = bleu.moses_multi_bleu(decoded, reference)
            print("%s\t%f" % ('bleu', bleu_score), file=f)
        tf.logging.info("BLEU, ROUGE values saved to results.txt")
Example #2
def moses_bl_rouge(p, l):
    # Print Moses multi-BLEU together with the full ROUGE-1/2/L P/R/F breakdown.
    bl = bleu.moses_multi_bleu(p, l)
    x = rouge.rouge(p, l)
    print(
        'Moses BLEU: %f\nROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\nROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\nROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
        %
        (bl, x['rouge_1/f_score'], x['rouge_1/p_score'], x['rouge_1/r_score'],
         x['rouge_2/f_score'], x['rouge_2/p_score'], x['rouge_2/r_score'],
         x['rouge_l/f_score'], x['rouge_l/p_score'], x['rouge_l/r_score']))
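A minimal call sketch, assuming p and l are parallel lists of hypothesis and reference strings and that the same bleu and rouge helper modules are importable:

p = ["the cat sat on the mat"]        # hypotheses
l = ["a cat was sitting on the mat"]  # references
moses_bl_rouge(p, l)  # prints Moses BLEU plus the nine ROUGE components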
Example #3
def np_rouge(val, ref, start, end):
    """ROUGE over integer token sequences, scored on the span between the start/end markers."""
    def trim_seq(seq, start, end):
        # Keep tokens after the start marker and before the end marker,
        # then strip trailing zero padding.
        seq = seq[list(seq).index(start) + 1:] if start in seq else seq
        seq = seq[:list(seq).index(end)] if end in seq else seq
        return np.trim_zeros(seq, 'b')

    val, ref = list(val), list(ref)
    for i in range(len(val)):
        val[i] = " ".join(str(c) for c in trim_seq(val[i], start, end))
        ref[i] = " ".join(str(c) for c in trim_seq(ref[i], start, end))
    return rouge(val, ref)
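A sketch of the expected inputs, assuming zero-padded integer token sequences where 1 and 2 stand in for the start and end marker ids (both values are hypothetical):

import numpy as np

val = np.array([[1, 5, 6, 7, 2, 0, 0]])  # hypothesis token ids
ref = np.array([[1, 5, 6, 8, 2, 0, 0]])  # reference token ids
scores = np_rouge(val, ref, start=1, end=2)  # scores "5 6 7" against "5 6 8"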
Example #4
def _rouge(ref_file, summarization_file, mode="brief"):
    """Compute ROUGE scores and handling BPE."""

    results = {}

    references = []
    role_tokens = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh:
        for line in fh:
            ref, role = process_dialogue_infer(line.rstrip(),
                                               get_role_token=True)
            references.append(ref)
            role_tokens.append(role)

    hypotheses = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file,
                                                  "rb")) as fh:
        for line in fh:
            hypotheses.append(line)

    rouge_score_map = rouge.rouge(hypotheses, references)
    results["all"] = 100 * rouge_score_map["rouge_l/f_score"]
    if mode == "brief":
        return results["all"]

    for role in ROLE_TOKENS:
        _sub_ref_texts = []
        _sub_hypos = []
        for _r, _t, _role in zip(references, hypotheses, role_tokens):
            if _role == role:
                _sub_ref_texts.append(_r)
                _sub_hypos.append(_t)
        rouge_score_map = rouge.rouge(_sub_hypos, _sub_ref_texts)
        results[role] = 100 * rouge_score_map["rouge_l/f_score"]

    return results
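A hypothetical invocation (file names are placeholders; process_dialogue_infer and ROLE_TOKENS must come from the surrounding module). The default "brief" mode returns just the overall ROUGE-L F-score scaled to 0-100, while any other mode value adds a per-role breakdown:

overall = _rouge("refs.txt", "summaries.txt")               # float, brief mode
by_role = _rouge("refs.txt", "summaries.txt", mode="full")  # {"all": ..., <role>: ...}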
Example #5
def _rouge(ref_file, summarization_file):
    """Compute ROUGE scores and handling BPE."""

    references = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh:
        for line in fh:
            references.append(process_dialogue_infer(line))

    hypotheses = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file,
                                                  "rb")) as fh:
        for line in fh:
            hypotheses.append(line)

    rouge_score_map = rouge.rouge(hypotheses, references)
    return 100 * rouge_score_map["rouge_l/f_score"]
Example #6
def evaluate(dataset_f, predictions_f, all_metrics=False, save_dir=""):
    with open(dataset_f) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(predictions_f) as prediction_file:
        predictions = json.load(prediction_file)
    gt = []
    pred = []
    f1 = exact_match = total = 0
    for article in dataset:
        # Skip articles that have no entry in the predictions
        # (a title lookup is needed in the case of dev-v1.1.json).
        if str(article['title']) not in predictions:
            continue
        for paragraph in article['paragraphs']:

            for qa in paragraph['qas']:
                total += 1

                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                if str(qa['id']) not in predictions:
                    prediction = ""
                else:
                    prediction = predictions[str(qa['id'])]
                if prediction == "":
                    prediction = 'n_a'
                gt.append(ground_truths[0])
                pred.append(prediction)
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction,
                                                    ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    if all_metrics:
        rouge_dict = rouge(pred, gt)
        file_path = os.path.join(save_dir, 'results.txt')
        with open(file_path, 'w') as f:
            for key in rouge_dict:
                print("%s\t%f" % (key, rouge_dict[key]), file=f)
            bleu_score = moses_multi_bleu(pred, gt)
            print("%s\t%f" % ('bleu', bleu_score), file=f)
            print("%s\t%f" % ('f1', f1), file=f)
            print("%s\t%f" % ('exact_match', exact_match), file=f)

    return exact_match, f1
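A minimal end-to-end sketch with hypothetical file contents in SQuAD v1.1 format, assuming the standard SQuAD helpers (metric_max_over_ground_truths, exact_match_score, f1_score) are defined; note that the prediction file is keyed both by article title and by question id:

import json

dataset = {"data": [{"title": "Rome", "paragraphs": [
    {"qas": [{"id": "q1", "answers": [{"text": "49 BC"}]}]}]}]}
predictions = {"Rome": True, "q1": "49 BC"}  # title key enables the article lookup

with open("dev.json", "w") as fh:
    json.dump(dataset, fh)
with open("pred.json", "w") as fh:
    json.dump(predictions, fh)

em, f1_val = evaluate("dev.json", "pred.json")  # 100.0, 100.0 for this toy pair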
Example #7
def target_based_np_rouge(val, ref, start, end, tz=True):
    """ROUGE over token sequences; the trim window is anchored at the reference's start marker."""
    def trim_seqs(val, ref, start, end):
        start_idx = list(ref).index(start) + 1 if start in ref else 0
        val = val[start_idx:]
        ref = ref[start_idx:]
        val = val[:list(val).index(end)] if end in val else val
        ref = ref[:list(ref).index(end)] if end in ref else ref
        if tz:
            val = np.trim_zeros(val, 'b')
            ref = np.trim_zeros(ref, 'b')
        return val, ref

    val, ref = list(val), list(ref)
    for i in range(len(val)):
        sval, sref = trim_seqs(val[i], ref[i], start, end)
        sval = " ".join(str(c) for c in sval)
        sref = " ".join(str(c) for c in sref)
        val[i] = sval
        ref[i] = sref
    return rouge(val, ref)
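As with np_rouge above, a sketch using zero-padded token ids and hypothetical marker ids 1 (start) and 2 (end); here both sequences are cut from the position of the reference's start marker:

import numpy as np

val = np.array([[1, 5, 6, 7, 0, 0]])  # hypothesis token ids
ref = np.array([[1, 5, 6, 2, 0, 0]])  # reference token ids
scores = target_based_np_rouge(val, ref, start=1, end=2)  # "5 6 7" vs "5 6"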
Example #8
def calculate_metrics_results(results: dict):
    for data in results.keys():
        references = []
        translations = []
        references_rouge = []
        translations_rouge = []
        wert = 0.0
        meteort = 0.0
        for video in results[data].keys():
            # Drop the end-of-sequence marker before joining tokens into a string.
            translation = results[data][video]["prediction_sentence"]
            if '</s>' in translation:
                translation.remove('</s>')
            translation = " ".join(translation)

            reference = results[data][video]["target_sentence"]
            if '</s>' in reference:
                reference.remove('</s>')
            reference = " ".join(reference)

            wert += jiwer.wer(truth=reference, hypothesis=translation)
            meteort += single_meteor_score(reference, translation)

            translations.append(translation.split(" "))
            translations_rouge.append(translation)

            references.append([reference.split(" ")])
            references_rouge.append(reference)

        print(len(references))
        rouge_score_map = rouge.rouge(translations_rouge, references_rouge)
        print(data + ' rouge: ' +
              str(100 * rouge_score_map["rouge_l/f_score"]))
        print(data + ' WER: ' + str((wert / len(references)) * 100))
        print(data + ' Meteor: ' + str((meteort / len(references)) * 100))
        for max_ in range(1, 5):
            bleu_score, _, _, _, _, _ = bleu.compute_bleu(references,
                                                          translations,
                                                          max_order=max_)
            print(data + ' bleu: ' + str(max_) + " " + str(bleu_score * 100))
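The snippet does not show the shape of results; a hypothetical minimal instance, with per-split, per-video token lists ending in the '</s>' marker, would be:

results = {
    "test": {
        "video_001": {
            "prediction_sentence": ["hello", "world", "</s>"],
            "target_sentence": ["hello", "there", "</s>"],
        },
    },
}
calculate_metrics_results(results)  # prints ROUGE-L, WER, METEOR and BLEU-1..4 per split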
Example #9
def evaluate(infer, ref, inferred_spans, ref_spans):
    bl = cal_bleu(infer, ref)
    x = rouge.rouge(infer, ref)
    f, e, total = f1.evaluate(inferred_spans, ref_spans)
    return (bl, x['rouge_1/f_score'] * 100, x['rouge_2/f_score'] * 100,
            x['rouge_l/f_score'] * 100, f, e, total)
Example #10
def cal_rouge(infer, ref):
    x = rouge.rouge(infer, ref)
    return (x['rouge_1/f_score'] * 100, x['rouge_2/f_score'] * 100,
            x['rouge_l/f_score'] * 100)
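A one-line usage check, assuming rouge.rouge accepts parallel lists of hypothesis and reference strings:

r1, r2, rl = cal_rouge(["the cat sat on the mat"], ["a cat sat on a mat"])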