# compute_bleu and remove_tags are helpers defined elsewhere in the project;
# their import lines are not shown in this snippet.
def bigram_overlap(s, t):
    s = remove_tags(s)
    t = remove_tags(t)
    if len(s) == 0 or len(t) == 0:
        return 0

    bigrams_s = list(zip(s[:-1], s[1:]))
    bigrams_t = list(zip(t[:-1], t[1:]))

    if len(bigrams_s) == 0:
        return 0

    # fraction of hypothesis bigrams that also occur in the reference
    overlap = sum(1 for bigram in bigrams_s if bigram in bigrams_t)
    return overlap / len(bigrams_s)


def bleu_similarity(s, t):
    # compute_bleu returns a tuple; its first element is the BLEU score
    return compute_bleu([[s]], [t])[0]


if __name__ == "__main__":
    print("Test")
    s = [1, 4, 140, 36, 6, 4, 31, 28, 4, 163, 2]
    t = [1, 4, 140, 36, 6, 4, 31, 28, 4, 3, 2]
    print(bleu_similarity(s, t))
    print(compute_bleu([[s]], [t])[0])

    print("Bigrams")
    print(bigram_overlap(s, t))
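
The bigram_overlap snippet above relies on a remove_tags helper defined elsewhere. A minimal sketch, assuming the "tags" are special token ids such as the 1 (BOS) and 2 (EOS) markers visible in the test sequences; the real helper may strip different tokens:

def remove_tags(tokens, special_ids=(0, 1, 2)):
    # hypothetical stand-in: drop padding/BOS/EOS ids before computing bigram overlap
    return [tok for tok in tokens if tok not in special_ids]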
Example #3
def main(argv_custom):

    sys.argv = argv_custom  # let the caller supply the argument list programmatically

    p = argparse.ArgumentParser(description="Evaluator for CoNaLa",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    p.add_argument("--input_dir",
                   help="input directory, containing 'res/answer.txt' and 'ref/truth.txt'",
                   default=None)
    p.add_argument("--input_ref",
                   help="input reference file",
                   default=None)
    p.add_argument("--input_hyp",
                   help="input hypothesis file",
                   default=None)
    p.add_argument("--output_file",
                   help="output score file",
                   default=None)
    p.add_argument("--output_dir",
                   help="output score directory which will contain output_dir/scores.txt",
                   default=None)
    p.add_argument("--no_exact_match",
                   help="only output bleu scores and not exact_match score",
                   action="store_true")
    p.add_argument("--strip_ref_metadata",
                   help="strip metadata from the reference and get only the code",
                   action="store_true")

    args = p.parse_args()

    if not (args.input_dir or (args.input_ref and args.input_hyp)):
        raise ValueError("Must specify input_dir or input_ref+input_hyp")

    input_hyp = args.input_hyp if args.input_hyp else os.path.join(args.input_dir, 'res', 'answer.txt')
    input_ref = args.input_ref if args.input_ref else os.path.join(args.input_dir, 'ref', 'truth.txt')

    with open(input_hyp, 'r') as f_hyp:
        c_hyp = json.load(f_hyp)
        c_hyp = [tokenize_for_bleu_eval(s) for s in c_hyp]
    with open(input_ref, 'r') as f_ref:
        c_ref = json.load(f_ref)
        if args.strip_ref_metadata:
            c_ref = [x['snippet'] for x in c_ref]
        c_ref = [tokenize_for_bleu_eval(s) for s in c_ref]

    if len(c_hyp) != len(c_ref):
        raise ValueError('Length of hypothesis and reference don\'t match: {} != {}'.format(len(c_hyp), len(c_ref)))

    if args.output_file:
        f_out = open(args.output_file, 'w')
    elif args.output_dir:
        f_out = open(os.path.join(args.output_dir, 'scores.txt'), 'w')
    else:
        f_out = sys.stdout

    bleu_tup = bleu_score.compute_bleu([[x] for x in c_ref], c_hyp, smooth=False)
    bleu = bleu_tup[0]
    exact = sum([1 if h == r else 0 for h, r in zip(c_hyp, c_ref)])/len(c_hyp)

    f_out.write('bleu:{0:.2f}\n'.format(bleu * 100))
    if not args.no_exact_match:
        f_out.write('exact:{0:.2f}\n'.format(exact * 100))

    if f_out is not sys.stdout:
        f_out.close()
    return bleu, exact
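
A possible programmatic invocation of main above. The file names are hypothetical, and the first argv entry is only a placeholder program name consumed by argparse:

bleu, exact = main(["evaluate.py",
                    "--input_ref", "truth.json",
                    "--input_hyp", "answer.json"])
print("BLEU = {:.2f}, exact = {:.2f}".format(bleu * 100, exact * 100))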
Example #4
def evaluate_bleu(reference_list, hypothesis_list):
    # reference_list must already be tokenized and nested (one list of reference
    # token lists per hypothesis); only the hypotheses are tokenized here
    b = [tokenize_for_bleu_eval(s) for s in hypothesis_list]
    return bleu_score.compute_bleu(reference_list, b, smooth=False)
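
A sketch of the input shapes evaluate_bleu expects; the snippet strings are invented:

refs = [[tokenize_for_bleu_eval("df.head(10)")]]   # one list of tokenized references per hypothesis
hyps = ["df.head(n=10)"]                           # raw strings, tokenized inside evaluate_bleu
print(evaluate_bleu(refs, hyps)[0])                # first tuple element is the BLEU score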
Example #5
    def evaluate_dataset(self, dataset, decode_results, fast_mode=False):
        examples = dataset
        assert len(examples) == len(decode_results)

        # speed-up: cache the tokenization results on the examples
        if not hasattr(examples[0], 'reference_code_tokens'):
            for example in examples:
                setattr(example, 'reference_code_tokens', tokenize_for_bleu_eval(example.info['example_dict']['snippet']))

        if not hasattr(decode_results[0][0], 'decanonical_code_tokens'):
            for i, example in enumerate(examples):
                hyp_list = decode_results[i]
                # Prune any hypothesis that throws an error when converted back to
                # decanonical code; note that this modifies decode_results in place.
                filtered_hyp_list = []
                for hyp in hyp_list:
                    if not hasattr(hyp, 'decanonical_code'):
                        try:
                            hyp.decanonical_code = decanonicalize_code(hyp.code, slot_map=example.info['slot_map'])
                            if hyp.decanonical_code:
                                hyp.decanonical_code_tokens = tokenize_for_bleu_eval(hyp.decanonical_code)
                                filtered_hyp_list.append(hyp)
                        except Exception:
                            # skip hypotheses whose code cannot be decanonicalized
                            pass

                decode_results[i] = filtered_hyp_list

        if fast_mode:
            references = [e.reference_code_tokens for e in examples]
            hypotheses = [hyp_list[0].decanonical_code_tokens if hyp_list else [] for hyp_list in decode_results]

            bleu_tup = compute_bleu([[x] for x in references], hypotheses, smooth=False)
            bleu = bleu_tup[0]

            return bleu
        else:
            tokenized_ref_snippets = []
            hyp_code_tokens = []
            best_hyp_code_tokens = []
            sm_func = SmoothingFunction().method3
            sent_bleu_scores = []
            oracle_bleu_scores = []
            oracle_exact_match = []
            for example, hyp_list in zip(examples, decode_results):
                tokenized_ref_snippets.append(example.reference_code_tokens)
                example_hyp_bleu_scores = []
                if hyp_list:
                    for i, hyp in enumerate(hyp_list):
                        hyp.bleu_score = sentence_bleu([example.reference_code_tokens],
                                                       hyp.decanonical_code_tokens,
                                                       smoothing_function=sm_func)
                        hyp.is_correct = self.is_hyp_correct(example, hyp)

                        example_hyp_bleu_scores.append(hyp.bleu_score)

                    top_decanonical_code_tokens = hyp_list[0].decanonical_code_tokens
                    sent_bleu_score = hyp_list[0].bleu_score

                    best_hyp_idx = np.argmax(example_hyp_bleu_scores)
                    oracle_sent_bleu = example_hyp_bleu_scores[best_hyp_idx]
                    _best_hyp_code_tokens = hyp_list[best_hyp_idx].decanonical_code_tokens
                else:
                    top_decanonical_code_tokens = []
                    sent_bleu_score = 0.
                    oracle_sent_bleu = 0.
                    _best_hyp_code_tokens = []

                oracle_exact_match.append(any(hyp.is_correct for hyp in hyp_list))
                hyp_code_tokens.append(top_decanonical_code_tokens)
                sent_bleu_scores.append(sent_bleu_score)
                oracle_bleu_scores.append(oracle_sent_bleu)
                best_hyp_code_tokens.append(_best_hyp_code_tokens)

            bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], hyp_code_tokens, smooth=False)
            corpus_bleu = bleu_tup[0]

            bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], best_hyp_code_tokens, smooth=False)
            oracle_corpus_bleu = bleu_tup[0]

            avg_sent_bleu = np.average(sent_bleu_scores)
            oracle_avg_sent_bleu = np.average(oracle_bleu_scores)
            exact = sum([1 if h == r else 0 for h, r in zip(hyp_code_tokens, tokenized_ref_snippets)]) / float(
                len(examples))
            oracle_exact_match = np.average(oracle_exact_match)

            return {'corpus_bleu': corpus_bleu,
                    'oracle_corpus_bleu': oracle_corpus_bleu,
                    'avg_sent_bleu': avg_sent_bleu,
                    'oracle_avg_sent_bleu': oracle_avg_sent_bleu,
                    'exact_match': exact,
                    'oracle_exact_match': oracle_exact_match}
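
A possible way to consume the metrics dictionary returned by evaluate_dataset in full (non-fast) mode; the evaluator, dataset and decode_results objects are assumed to exist already:

metrics = evaluator.evaluate_dataset(dataset, decode_results, fast_mode=False)
for name in ('corpus_bleu', 'oracle_corpus_bleu', 'avg_sent_bleu',
             'oracle_avg_sent_bleu', 'exact_match', 'oracle_exact_match'):
    print('{}: {:.4f}'.format(name, metrics[name]))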
Example #6
def pure_bleu(reference, candidate):
    # compute_bleu expects a list of reference lists (one per hypothesis) and a
    # list of hypotheses, hence the extra nesting for a single sentence pair;
    # this nesting was also suspected of being related to the wrong-BLEU-computation
    # issue mentioned elsewhere. Element 0 of the returned tuple is the BLEU score.
    return compute_bleu(
        [[tokenize_builtin(reference)]], [tokenize_builtin(candidate)]
    )[0]
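
For completeness, a small usage sketch of pure_bleu; the code snippets are invented and tokenize_builtin is assumed to be the project's own tokenizer:

score = pure_bleu("x = [i for i in range(10)]", "x = list(range(10))")
print("BLEU for the single pair: {:.4f}".format(score))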