def bleu_similarity(s, t):
    # Corpus-level BLEU of a single hypothesis t against a single reference s;
    # compute_bleu returns a tuple whose first element is the BLEU score.
    return compute_bleu([[s]], [t])[0]
def bigram_overlap(s, t):
    # Fraction of s's bigrams that also occur among t's bigrams.
    s = remove_tags(s)
    t = remove_tags(t)
    if len(s) == 0 or len(t) == 0:
        return 0
    bigrams_s = list(zip(s[:-1], s[1:]))
    bigrams_t = list(zip(t[:-1], t[1:]))
    if len(bigrams_s) == 0:
        return 0
    overlap = 0
    for bigram in bigrams_s:
        overlap += 1 if bigram in bigrams_t else 0
    return overlap / len(bigrams_s)


def bleu_similarity(s, t):
    return compute_bleu([[s]], [t])[0]


if __name__ == "__main__":
    print("Test")
    s = [1, 4, 140, 36, 6, 4, 31, 28, 4, 163, 2]
    t = [1, 4, 140, 36, 6, 4, 31, 28, 4, 3, 2]
    print(bleu_similarity(s, t))
    print(compute_bleu([[s]], [t])[0])
    print("Bigrams")
    print(bigram_overlap(s, t))
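# Hypothetical usage sketch (not part of the original module): bigram_overlap is
# asymmetric -- it reports the fraction of s's bigrams found anywhere in t -- so
# swapping the arguments can change the score. remove_tags is assumed to be the
# helper defined elsewhere in this module that strips special sentinel tokens.
def bigram_overlap_demo():
    a = [1, 4, 140, 36, 6, 2]
    b = [1, 4, 140, 36, 6, 4, 140, 2]
    return bigram_overlap(a, b), bigram_overlap(b, a)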
def main(argv_custom):
    # Replace sys.argv so argparse sees the caller-supplied argument list.
    sys.argv = argv_custom

    p = argparse.ArgumentParser(description="Evaluator for CoNaLa",
                                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    p.add_argument("--input_dir",
                   help="input directory, containing 'res/answer.txt' and 'ref/truth.txt'",
                   default=None)
    p.add_argument("--input_ref", help="input reference file", default=None)
    p.add_argument("--input_hyp", help="input hypothesis file", default=None)
    p.add_argument("--output_file", help="output score file", default=None)
    p.add_argument("--output_dir",
                   help="output score directory which will contain output_dir/scores.txt",
                   default=None)
    p.add_argument("--no_exact_match",
                   help="only output bleu scores and not exact_match score",
                   action="store_true")
    p.add_argument("--strip_ref_metadata",
                   help="strip metadata from the reference and get only the code",
                   action="store_true")
    args = p.parse_args()

    if not (args.input_dir or (args.input_ref and args.input_hyp)):
        raise ValueError("Must specify input_dir or input_ref+input_hyp")

    input_hyp = args.input_hyp if args.input_hyp else os.path.join(args.input_dir, 'res', 'answer.txt')
    input_ref = args.input_ref if args.input_ref else os.path.join(args.input_dir, 'ref', 'truth.txt')

    with open(input_hyp, 'r') as f_hyp:
        c_hyp = json.load(f_hyp)
    c_hyp = [tokenize_for_bleu_eval(s) for s in c_hyp]

    with open(input_ref, 'r') as f_ref:
        c_ref = json.load(f_ref)
    if args.strip_ref_metadata:
        c_ref = [x['snippet'] for x in c_ref]
    c_ref = [tokenize_for_bleu_eval(s) for s in c_ref]

    if len(c_hyp) != len(c_ref):
        raise ValueError("Length of hypothesis and reference don't match: {} != {}".format(len(c_hyp), len(c_ref)))

    if args.output_file:
        f_out = open(args.output_file, 'w')
    elif args.output_dir:
        f_out = open(os.path.join(args.output_dir, 'scores.txt'), 'w')
    else:
        f_out = sys.stdout

    bleu_tup = bleu_score.compute_bleu([[x] for x in c_ref], c_hyp, smooth=False)
    bleu = bleu_tup[0]
    exact = sum([1 if h == r else 0 for h, r in zip(c_hyp, c_ref)]) / len(c_hyp)

    f_out.write('bleu:{0:.2f}\n'.format(bleu * 100))
    if not args.no_exact_match:
        f_out.write('exact:{0:.2f}\n'.format(exact * 100))
    # Close only real output files, never sys.stdout.
    if f_out is not sys.stdout:
        f_out.close()

    return bleu, exact
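# Hypothetical invocation sketch (assumed, not from the original evaluator):
# main() routes the custom argv through argparse via sys.argv, so index 0 must
# hold a program name. The JSON file paths below are placeholders.
def run_evaluator_demo():
    argv = ['evaluator.py',
            '--input_ref', 'ref/truth.txt',
            '--input_hyp', 'res/answer.txt',
            '--strip_ref_metadata']
    bleu, exact = main(argv)
    return bleu, exact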
def evaluate_bleu(reference_list, hypothesis_list):
    # Hypotheses are tokenized here; reference_list is expected to already be in
    # compute_bleu's reference_corpus layout (one list of token lists per example).
    b = [tokenize_for_bleu_eval(s) for s in hypothesis_list]
    return bleu_score.compute_bleu(reference_list, b, smooth=False)
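# Hypothetical usage sketch (assumed inputs, not from the original code):
# reference_list is pre-tokenized and nested one level deeper than
# hypothesis_list, which holds raw code strings that evaluate_bleu tokenizes.
def evaluate_bleu_demo():
    references = [[tokenize_for_bleu_eval("x = np.zeros(10)")]]
    hypotheses = ["x = numpy.zeros(10)"]
    bleu_tup = evaluate_bleu(references, hypotheses)
    return bleu_tup[0]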
def evaluate_dataset(self, dataset, decode_results, fast_mode=False):
    examples = dataset
    assert len(examples) == len(decode_results)

    # speed up, cache tokenization results
    if not hasattr(examples[0], 'reference_code_tokens'):
        for example in examples:
            setattr(example, 'reference_code_tokens',
                    tokenize_for_bleu_eval(example.info['example_dict']['snippet']))

    if not hasattr(decode_results[0][0], 'decanonical_code_tokens'):
        for i, example in enumerate(examples):
            hyp_list = decode_results[i]
            # here we prune any hypothesis that throws an error when converting
            # back to the decanonical code!
            # This modifies the decode_results in-place!
            filtered_hyp_list = []
            for hyp in hyp_list:
                if not hasattr(hyp, 'decanonical_code'):
                    try:
                        hyp.decanonical_code = decanonicalize_code(hyp.code, slot_map=example.info['slot_map'])
                        if hyp.decanonical_code:
                            hyp.decanonical_code_tokens = tokenize_for_bleu_eval(hyp.decanonical_code)
                            filtered_hyp_list.append(hyp)
                    except Exception:
                        pass
            decode_results[i] = filtered_hyp_list

    if fast_mode:
        references = [e.reference_code_tokens for e in examples]
        hypotheses = [hyp_list[0].decanonical_code_tokens if hyp_list else [] for hyp_list in decode_results]

        bleu_tup = compute_bleu([[x] for x in references], hypotheses, smooth=False)
        bleu = bleu_tup[0]

        return bleu
    else:
        tokenized_ref_snippets = []
        hyp_code_tokens = []
        best_hyp_code_tokens = []
        sm_func = SmoothingFunction().method3
        sent_bleu_scores = []
        oracle_bleu_scores = []
        oracle_exact_match = []

        for example, hyp_list in zip(examples, decode_results):
            tokenized_ref_snippets.append(example.reference_code_tokens)
            example_hyp_bleu_scores = []
            if hyp_list:
                for i, hyp in enumerate(hyp_list):
                    hyp.bleu_score = sentence_bleu([example.reference_code_tokens],
                                                   hyp.decanonical_code_tokens,
                                                   smoothing_function=sm_func)
                    hyp.is_correct = self.is_hyp_correct(example, hyp)
                    example_hyp_bleu_scores.append(hyp.bleu_score)

                top_decanonical_code_tokens = hyp_list[0].decanonical_code_tokens
                sent_bleu_score = hyp_list[0].bleu_score

                best_hyp_idx = np.argmax(example_hyp_bleu_scores)
                oracle_sent_bleu = example_hyp_bleu_scores[best_hyp_idx]
                _best_hyp_code_tokens = hyp_list[best_hyp_idx].decanonical_code_tokens
            else:
                top_decanonical_code_tokens = []
                sent_bleu_score = 0.
                oracle_sent_bleu = 0.
                _best_hyp_code_tokens = []

            oracle_exact_match.append(any(hyp.is_correct for hyp in hyp_list))
            hyp_code_tokens.append(top_decanonical_code_tokens)
            sent_bleu_scores.append(sent_bleu_score)
            oracle_bleu_scores.append(oracle_sent_bleu)
            best_hyp_code_tokens.append(_best_hyp_code_tokens)

        bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], hyp_code_tokens, smooth=False)
        corpus_bleu = bleu_tup[0]

        bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], best_hyp_code_tokens, smooth=False)
        oracle_corpus_bleu = bleu_tup[0]

        avg_sent_bleu = np.average(sent_bleu_scores)
        oracle_avg_sent_bleu = np.average(oracle_bleu_scores)

        exact = sum([1 if h == r else 0 for h, r in zip(hyp_code_tokens, tokenized_ref_snippets)]) / float(len(examples))
        oracle_exact_match = np.average(oracle_exact_match)

        return {'corpus_bleu': corpus_bleu,
                'oracle_corpus_bleu': oracle_corpus_bleu,
                'avg_sent_bleu': avg_sent_bleu,
                'oracle_avg_sent_bleu': oracle_avg_sent_bleu,
                'exact_match': exact,
                'oracle_exact_match': oracle_exact_match}
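# Illustrative stand-ins (assumptions inferred from the attribute accesses in
# evaluate_dataset, not the project's real classes): each dataset example
# carries an `info` dict with the original snippet and its slot map, and
# decode_results holds one ranked hypothesis list per example, each hypothesis
# exposing the canonicalized `.code` string.
class ExampleStub:
    def __init__(self, snippet, slot_map):
        self.info = {'example_dict': {'snippet': snippet}, 'slot_map': slot_map}


class HypothesisStub:
    def __init__(self, code):
        self.code = code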
def pure_bleu(reference, candidate):
    # compute_bleu takes a reference corpus (one list of tokenized references per
    # hypothesis) and a translation corpus (a list of tokenized hypotheses), so a
    # single reference/candidate pair needs this exact level of nesting. The
    # earlier confusion about the bracket count may be related to the issue with
    # the wrong BLEU computation.
    return compute_bleu(
        [[tokenize_builtin(reference)]],
        [tokenize_builtin(candidate)]
    )[0]
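# Minimal shape sketch (an assumption based on the compute_bleu calls above, not
# original code): reference_corpus has one entry per hypothesis, each entry a
# list of tokenized references, while translation_corpus is a flat list of
# tokenized hypotheses.
def corpus_bleu_demo():
    refs = [["return", "a", "+", "b"], ["print", "(", "x", ")"]]
    hyps = [["return", "a", "+", "b"], ["print", "(", "y", ")"]]
    # Wrap each reference in its own list so every hypothesis gets a reference set.
    return compute_bleu([[r] for r in refs], hyps, smooth=True)[0]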