Example #1
def vg_score(text, img_path, vg_attributes, vg_relations, visual_context):
    # Attributes and relations annotated for the target image.
    target_attributes = vg_attributes[img_path]
    target_relations = vg_relations[img_path]

    all_features = target_attributes | target_relations

    # Exclude the target image from the distractor set (note: this mutates
    # the caller's set in place).
    visual_context -= {img_path}

    # Pool the features of every distractor image in the visual context.
    confounding_attributes = set()
    confounding_relations = set()
    for path in visual_context:
        confounding_attributes |= vg_attributes[path]
        confounding_relations |= vg_relations[path]

    # Discriminative features are those that appear only for the target image.
    discriminative_attributes = target_attributes - confounding_attributes
    discriminative_relations = target_relations - confounding_relations

    discriminative_features = discriminative_attributes | discriminative_relations
    # METEOR score of the text against the space-joined discriminative
    # features (reference first, hypothesis second).
    meteor_score = meteor(' '.join(discriminative_features), text, gamma=0)

    return meteor_score, discriminative_features, all_features
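
# --- Usage sketch (illustrative) ---------------------------------------
# A minimal, hypothetical example of calling vg_score. The image paths and
# feature sets below are made up; in practice vg_attributes / vg_relations
# would map image paths to sets of Visual Genome attribute and relation
# strings.
example_attributes = {
    'img/2402295.jpg': {'red shirt', 'wooden table'},
    'img/2402296.jpg': {'wooden table'},
}
example_relations = {
    'img/2402295.jpg': {'man holding cup'},
    'img/2402296.jpg': {'cup on table'},
}
score, discriminative, all_feats = vg_score(
    text='a man in a red shirt holding a cup',
    img_path='img/2402295.jpg',
    vg_attributes=example_attributes,
    vg_relations=example_relations,
    visual_context={'img/2402295.jpg', 'img/2402296.jpg'},
)
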
def benchmark(corpus_candidates: np.ndarray, corpus_references: np.ndarray):
    corpus_candidates_split = [
        candidate.strip().split(' ') for candidate in corpus_candidates
    ]
    corpus_references_split = [[
        reference.strip().split(' ') for reference in refs
    ] for refs in corpus_references]
    bleu_1 = 100 * bleu.corpus_bleu(
        corpus_references_split, corpus_candidates_split, weights=(1.0, ))
    print(f"BLEU-1: {bleu_1}")
    bleu_2 = 100 * bleu.corpus_bleu(
        corpus_references_split, corpus_candidates_split, weights=(0.5, 0.5))
    print(f"BLEU-2: {bleu_2}")
    bleu_3 = 100 * bleu.corpus_bleu(corpus_references_split,
                                    corpus_candidates_split,
                                    weights=(1.0 / 3, 1.0 / 3, 1.0 / 3))
    print(f"BLEU-3: {bleu_3}")
    bleu_4 = 100 * bleu.corpus_bleu(corpus_references_split,
                                    corpus_candidates_split,
                                    weights=(0.25, 0.25, 0.25, 0.25))
    print(f"BLEU-4: {bleu_4}")
    # Sentence-level ROUGE-L with beta = P_lcs / (R_lcs + 1e-12)
    rouge_l_sentence_level = 100 * rouge_l(corpus_candidates_split,
                                           corpus_references_split)
    print(f"ROUGE-L: {rouge_l_sentence_level}")
    meteor_score = 100 * np.mean(
        np.array([
            meteor(references, candidate)
            for (references,
                 candidate) in zip(corpus_references, corpus_candidates)
        ]))
    # meteor_score = Meteor().compute_score(
    #     gts={i: corpus_references[i] for i in range(len(corpus_references))},
    #     res={i: [corpus_candidates[i]] for i in range(len(corpus_candidates))}
    # )
    print(f"METEOR: {meteor_score}")
    f1_score = 100 * corpus_f1_score(corpus_candidates_split,
                                     corpus_references_split)
    print(f"F1 macro average: {f1_score}")


def benchmark(corpus_candidates: np.ndarray, corpus_references: np.ndarray):
    corpus_candidates_split = [
        candidate.strip().split(' ') for candidate in corpus_candidates
    ]
    corpus_references_split = [[
        reference.strip().split(' ') for reference in refs
    ] for refs in corpus_references]
    bleu_1 = bleu.corpus_bleu(corpus_references_split,
                              corpus_candidates_split,
                              weights=(1.0, ))
    print(f"BLEU-1: {bleu_1}")
    bleu_2 = bleu.corpus_bleu(corpus_references_split,
                              corpus_candidates_split,
                              weights=(0.5, 0.5))
    print(f"BLEU-2: {bleu_2}")
    bleu_3 = bleu.corpus_bleu(corpus_references_split,
                              corpus_candidates_split,
                              weights=(1.0 / 3, 1.0 / 3, 1.0 / 3))
    print(f"BLEU-3: {bleu_3}")
    bleu_4 = bleu.corpus_bleu(corpus_references_split,
                              corpus_candidates_split,
                              weights=(0.25, 0.25, 0.25, 0.25))
    print(f"BLEU-4: {bleu_4}")
    # Sentence-level ROUGE-L with beta = P_lcs / (R_lcs + 1e-12)
    rouge_l_sentence_level = rouge_l(corpus_candidates_split,
                                     corpus_references_split)
    print(f"ROUGE-L: {rouge_l_sentence_level}")
    meteor_score = np.mean(
        np.array([
            meteor(references, candidate)
            for (references,
                 candidate) in zip(corpus_references, corpus_candidates)
        ]))
    print(f"METEOR macro average: {meteor_score}")
    f1_score = corpus_f1_score(corpus_candidates_split,
                               corpus_references_split)
    print(f"F1 macro average: {f1_score}")
def evaluate(args):
    metrics = args.metrics.lower().split(',')

    sentences = defaultdict(list)
    with open(args.generated) as f:
        lines = [line.strip().split('\t') for line in f]
        lines = [(int(row[0]), row[1]) for row in lines]
        for idx, sent in lines:
            sentences[idx].append(sent)
    logging.debug("Example generated sentences: {}".format(sentences[0]))
    logging.debug("Read {} generated sentences".format(len(sentences)))

    with open(args.ground_truth) as f:
        gt_sentences = [line.strip() for line in f]
    logging.debug("Example gt sentences: {}".format(gt_sentences[0]))
    logging.debug("Read {} gt sentences".format(len(gt_sentences)))

    if args.toy:
        gt_sentences = gt_sentences[:4]
        sentences = {i: sentences[i] for i in range(4)}

    os.makedirs(os.path.dirname(args.save), exist_ok=True)
    with open(args.save, 'w') as f:
        f.write("Generated sentences file: {}\n".format(args.generated))
        if args.ground_truth is not None:
            f.write("Ground truth file: {}\n".format(args.ground_truth))

        cnt = len(sentences)
        if 'meteor' in metrics:
            logging.debug("START EVALUATION: METEOR")

            # Calculate the METEOR score for each paraphrase
            meteor_scores = defaultdict(list)
            for idx, candidates in sentences.items():
                gt = gt_sentences[idx]
                for cand in candidates:
                    score = meteor(gt, cand)
                    meteor_scores[idx].append((score, cand))
            logging.debug("Example METEOR scores: {}".format(meteor_scores[0]))

            # Get the best METEOR score for each input
            for key in meteor_scores.keys():
                meteor_scores[key].sort(key=lambda row: -row[0])
            best_score = sum([slist[0][0]
                              for slist in meteor_scores.values()]) / cnt
            logging.info("Best METEOR:  {}".format(best_score))
            f.write("Best METEOR:  {:.4f}\n".format(best_score))

            # Get top 3 METEOR scores for each input
            top3_score = sum([
                sum([score for score, _ in row[:3]]) / len(row[:3])
                for row in meteor_scores.values()
            ]) / cnt
            logging.debug("Example top 3 METEOR scores: {}".format(
                meteor_scores[0][:3]))
            logging.info("Top 3 METEOR: {}".format(top3_score))
            f.write("Top 3 METEOR: {:.4f}\n".format(top3_score))

            if 'self-bleu' in metrics:
                logging.debug("START EVALUATION: Self-BLEU")

                # Self-BLEU among top 3 paraphrases
                sbleu = 0
                weights = {'4gram': (0.25, 0.25, 0.25, 0.25)}
                for val in meteor_scores.values():
                    refs = [sent for _, sent in val[:3]]
                    calculator = SelfBLEU(refs, weights=weights)
                    score_list = calculator.get_score()['4gram']
                    sbleu += sum(score_list) / len(score_list)
                logging.info("self-BLEU among top 3: {}".format(sbleu / cnt))
                f.write("self-BLEU among top 3: {:.4f}\n".format(sbleu / cnt))

        if 'rouge' in metrics:
            logging.debug("START EVALUATION: ROUGE")

            # Calculate the ROUGE scores for each paraphrase
            rouge1_scores = defaultdict(list)
            rouge2_scores = defaultdict(list)
            rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'],
                                             use_stemmer=True)
            for idx, candidates in sentences.items():
                gt = gt_sentences[idx]
                for cand in candidates:
                    scores = rouge.score(gt, cand)
                    rouge1_scores[idx].append(
                        (scores['rouge1'].fmeasure, cand))
                    rouge2_scores[idx].append(
                        (scores['rouge2'].fmeasure, cand))
            logging.debug("Example ROUGE-1 scores: {}".format(
                rouge1_scores[0]))
            logging.debug("Example ROUGE-2 scores: {}".format(
                rouge2_scores[0]))

            # Get the best ROUGE score for each input
            for key in rouge1_scores.keys():
                rouge1_scores[key].sort(key=lambda row: -row[0])
            for key in rouge2_scores.keys():
                rouge2_scores[key].sort(key=lambda row: -row[0])
            best_rouge1 = sum(
                [slist[0][0] for slist in rouge1_scores.values()]) / cnt
            best_rouge2 = sum(
                [slist[0][0] for slist in rouge2_scores.values()]) / cnt
            logging.info("Best ROUGE-1: {}".format(best_rouge1))
            logging.info("Best ROUGE-2: {}".format(best_rouge2))
            f.write("Best ROUGE-1: {:.4f}\n".format(best_rouge1))
            f.write("Best ROUGE-2: {:.4f}\n".format(best_rouge2))

            # Get top 3 ROUGE scores for each input
            top3_rouge1 = sum([
                sum([score for score, _ in row[:3]]) / len(row[:3])
                for row in rouge1_scores.values()
            ]) / cnt
            top3_rouge2 = sum([
                sum([score for score, _ in row[:3]]) / len(row[:3])
                for row in rouge2_scores.values()
            ]) / cnt
            logging.debug("Example top 3 ROUGE-1 scores: {}".format(
                rouge1_scores[0][:3]))
            logging.debug("Example top 3 ROUGE-2 scores: {}".format(
                rouge2_scores[0][:3]))
            logging.info("Top 3 ROUGE-1: {}".format(top3_rouge1))
            logging.info("Top 3 ROUGE-2: {}".format(top3_rouge2))
            f.write("Top 3 ROUGE-1: {:.4f}\n".format(top3_rouge1))
            f.write("Top 3 ROUGE-2: {:.4f}\n".format(top3_rouge2))
        logging.debug("DONE EVALUATION")