def one_match(asts, ast2, rewrite=True, ignore_arg_value=False):
    """Return True if `ast2` matches at least one candidate tree.

    Every tree is rendered with `data_tools.ast2template` (loose constraints,
    `arg_type_only=ignore_arg_value`) and passed through `ignore_differences`
    before the comparison.

    Args:
        asts: trees to compare against. When `rewrite` is true they are first
            passed to `get_rewrites` together with a connection obtained from
            `er.DBConnection()`, and the result is used as the candidate set.
        ast2: the tree being checked.
        rewrite: whether to go through `get_rewrites` or use `asts` as given.
        ignore_arg_value: forwarded as `arg_type_only` to `ast2template`.

    Returns:
        True as soon as one candidate renders equal to `ast2`, else False.
    """
    def _normalized(tree):
        # Single place that fixes the rendering options for both sides.
        return ignore_differences(data_tools.ast2template(
            tree, loose_constraints=True, arg_type_only=ignore_arg_value))

    if rewrite:
        with er.DBConnection() as db:
            candidates = get_rewrites(asts, db)
    else:
        candidates = asts

    target = _normalized(ast2)
    return any(_normalized(candidate) == target for candidate in candidates)
def _prompt_error_type():
    """Ask the judge which error category applies and return the raw reply."""
    return raw_input(
        "Error type: \n"
        "(2) extra utility \n"
        "(3) missing utility \n"
        "(4) confused utility \n"
        "(5) extra flag \n"
        "(6) missing flag \n"
        "(7) confused flag \n"
        "(8) logic error\n"
        "(9) count error\n"
    )


def manual_eval(model, dataset, rev_nl_vocab, FLAGS, output_dir, num_eval=30):
    """Interactively judge the model's top prediction on up to `num_eval` examples.

    The dataset is grouped by natural-language description, shuffled with a
    constant random source and walked in order. For each example the top
    prediction is judged at the template level and, if the template is
    correct, at the full-command level. A judgement already stored in the
    database is reused; otherwise `ast_based.one_match` is tried against the
    ground-truth trees and the user is prompted only when that fails. A
    transcript is written to `<output_dir>/manual.eval.results` and summary
    scores are printed.

    Typing "REVERSE" at the template prompt steps back one example; typing it
    at the command prompt repeats the current prediction with stored
    judgements ignored.

    Args:
        model: passed through to `db.get_top_k_predictions`.
        dataset: passed to `data_utils.group_data_by_nl`.
        rev_nl_vocab: unused; kept so existing callers keep working.
        FLAGS: only `FLAGS.dataset` is read, to choose the command parser.
        output_dir: directory that receives the transcript file.
        num_eval: number of examples to judge before stopping.

    Raises:
        ValueError: if the reply to the error-type prompt is not an integer.
    """
    num_top1_correct_temp = 0.0
    num_top3_correct_temp = 0.0
    num_top5_correct_temp = 0.0
    num_top10_correct_temp = 0.0
    num_top1_correct = 0.0
    num_top3_correct = 0.0
    num_top5_correct = 0.0
    num_top10_correct = 0.0
    num_evaled = 0
    # Bound up front: the summary reads it even when no example is processed.
    predictions = []

    grouped_dataset = data_utils.group_data_by_nl(dataset, use_bucket=False) \
        .values()
    # A constant random source makes the shuffled order reproducible for a
    # given input order.
    random.shuffle(grouped_dataset, lambda: 0.5208484091114275)

    cmd_parser = data_tools.bash_parser if FLAGS.dataset == "bash" \
        else data_utils.parse_brackets

    rejudge = False

    # `with` guarantees the transcript is flushed and closed even if a prompt
    # or a database call raises part-way through the session.
    with open(os.path.join(output_dir, "manual.eval.results"), 'w') as o_f:
        with DBConnection() as db:
            db.create_schema()
            while num_evaled < len(grouped_dataset):
                if num_evaled == num_eval:
                    break

                nl_strs, cm_strs, _, _ = grouped_dataset[num_evaled]
                nl_str = nl_strs[0].decode('utf-8')

                gt_trees = [cmd_parser(cmd) for cmd in cm_strs]
                predictions = db.get_top_k_predictions(model, nl_str, k=10)

                top1_correct_temp = top3_correct_temp = False
                top5_correct_temp = top10_correct_temp = False
                top1_correct = top3_correct = False
                top5_correct = top10_correct = False

                # evaluation ignoring ordering of flags
                print("Example %d (%d)" % (num_evaled+1, len(cm_strs)))
                o_f.write("Example %d (%d)" % (num_evaled+1, len(cm_strs)) + "\n")
                print("English: " + nl_str.strip())
                o_f.write("English: " + nl_str.encode('utf-8'))
                for j, cm_str in enumerate(cm_strs):
                    print("GT Command %d: " % (j+1) + cm_str.strip())
                    o_f.write("GT Command %d: " % (j+1) + cm_str.strip() + "\n")

                pred_id = 0
                # The bound is min(1, ...), so at most the top-ranked
                # prediction is judged per example.
                while pred_id < min(1, len(predictions)):
                    pred_cmd, score = predictions[pred_id]
                    tree = cmd_parser(pred_cmd)
                    print("Prediction {}: {} ({})".format(pred_id+1, pred_cmd, score))
                    o_f.write("Prediction {}: {} ({})\n".format(pred_id+1, pred_cmd, score))
                    print()
                    pred_temp = data_tools.ast2template(tree, loose_constraints=True)
                    str_judge = db.get_str_judgement((nl_str, pred_cmd))
                    temp_judge = db.get_temp_judgement((nl_str, pred_temp))

                    # --- template-level judgement ---
                    if temp_judge is not None and not rejudge:
                        judgement_str = "y" if temp_judge == 1 \
                            else "n ({})".format(error_types[temp_judge])
                        print("Correct template [y/n]: %s" % judgement_str)
                    else:
                        temp_judge = ast_based.one_match(
                            gt_trees, tree, rewrite=False, ignore_arg_value=True)
                        if not temp_judge:
                            inp = raw_input("Correct template [y/n]: ")
                            if inp == "REVERSE":
                                # Handled after the prediction loop: step back
                                # one example.
                                rejudge = True
                            else:
                                if inp == "y":
                                    temp_judge = True
                                    db.add_temp_judgement((nl_str, pred_temp, 1))
                                else:
                                    temp_judge = False
                                    error_type = _prompt_error_type()
                                    db.add_temp_judgement(
                                        (nl_str, pred_temp, int(error_type)))
                                rejudge = False
                        else:
                            print("Correct template [y/n]: y")

                    if temp_judge == 1:
                        # A hit at rank r counts for every top-k with r < k.
                        top1_correct_temp = top1_correct_temp or pred_id < 1
                        top3_correct_temp = top3_correct_temp or pred_id < 3
                        top5_correct_temp = top5_correct_temp or pred_id < 5
                        top10_correct_temp = top10_correct_temp or pred_id < 10
                        o_f.write("C")

                        # --- command-level judgement ---
                        if str_judge is not None and not rejudge:
                            judgement_str = "y" if str_judge == 1 \
                                else "n ({})".format(error_types[str_judge])
                            print("Correct command [y/n]: %s" % judgement_str)
                        else:
                            str_judge = ast_based.one_match(
                                gt_trees, tree, rewrite=False, ignore_arg_value=False)
                            if not str_judge:
                                inp = raw_input("Correct command [y/n]: ")
                                if inp == "REVERSE":
                                    # Repeat this prediction (pred_id is not
                                    # advanced) with stored judgements ignored.
                                    rejudge = True
                                    continue
                                elif inp == "y":
                                    str_judge = True
                                    # NOTE(review): together with the write
                                    # below, an interactive answer logs the
                                    # marker twice -- confirm this is intended.
                                    o_f.write("C")
                                    db.add_str_judgement((nl_str, pred_cmd, 1))
                                else:
                                    str_judge = False
                                    o_f.write("W")
                                    db.add_str_judgement((nl_str, pred_cmd, 0))
                            else:
                                print("Correct command [y/n]: y")

                        if str_judge == 1:
                            top1_correct = top1_correct or pred_id < 1
                            top3_correct = top3_correct or pred_id < 3
                            top5_correct = top5_correct or pred_id < 5
                            top10_correct = top10_correct or pred_id < 10
                            o_f.write("C")
                        else:
                            o_f.write("W")
                    else:
                        o_f.write("WW")

                    o_f.write("\n")
                    o_f.write("\n")
                    pred_id += 1

                if rejudge:
                    # Step back one example, but never below the first one: a
                    # negative index would silently select the last example.
                    # NOTE(review): the example stepped back to was already
                    # tallied once and is tallied again when passed a second
                    # time.
                    num_evaled = max(num_evaled - 1, 0)
                else:
                    num_evaled += 1
                    if top1_correct_temp:
                        num_top1_correct_temp += 1
                    if top3_correct_temp:
                        num_top3_correct_temp += 1
                    if top5_correct_temp:
                        num_top5_correct_temp += 1
                    if top10_correct_temp:
                        num_top10_correct_temp += 1
                    if top1_correct:
                        num_top1_correct += 1
                    if top3_correct:
                        num_top3_correct += 1
                    if top5_correct:
                        num_top5_correct += 1
                    if top10_correct:
                        num_top10_correct += 1

                rejudge = False
                print()

        def _score(num_correct):
            # Normalise by the number of examples actually processed; report
            # 0.0 rather than dividing by zero when there were none.
            return num_correct / num_evaled if num_evaled else 0.0

        print("%d examples evaluated" % num_evaled)
        print("Top 1 Template Match Score = %.2f" % _score(num_top1_correct_temp))
        print("Top 1 String Match Score = %.2f" % _score(num_top1_correct))
        # NOTE(review): the console uses "> 3" while the transcript below uses
        # "> 1"; both are kept as found -- confirm which threshold is intended.
        if len(predictions) > 3:
            print("Top 5 Template Match Score = %.2f" % _score(num_top5_correct_temp))
            print("Top 5 String Match Score = %.2f" % _score(num_top5_correct))
            print("Top 10 Template Match Score = %.2f" % _score(num_top10_correct_temp))
            print("Top 10 String Match Score = %.2f" % _score(num_top10_correct))
        print()

        o_f.write("%d examples evaluated" % num_evaled + "\n")
        o_f.write("Top 1 Template MatchScore = %.2f" % _score(num_top1_correct_temp) + "\n")
        o_f.write("Top 1 String Match Score = %.2f" % _score(num_top1_correct) + "\n")
        if len(predictions) > 1:
            o_f.write("Top 5 Template Match Score = %.2f" % _score(num_top5_correct_temp) + "\n")
            o_f.write("Top 5 String Match Score = %.2f" % _score(num_top5_correct) + "\n")
            o_f.write("Top 10 Template Match Score = %.2f" % _score(num_top10_correct_temp) + "\n")
            o_f.write("Top 10 String Match Score = %.2f" % _score(num_top10_correct) + "\n")
        o_f.write("\n")
def string_match(ast1, ast2):
    """Return True if both trees render identically with argument values kept.

    Each tree is rendered by `data_tools.ast2template` with
    `loose_constraints=True` and `arg_type_only=False`, then passed through
    `ignore_differences`; the two results are compared for equality.
    """
    rendered = [
        ignore_differences(data_tools.ast2template(
            tree, loose_constraints=True, arg_type_only=False))
        for tree in (ast1, ast2)
    ]
    return rendered[0] == rendered[1]
def template_match(ast1, ast2):
    """Return True if both trees render to the same template.

    Each tree is rendered by `data_tools.ast2template` with
    `loose_constraints=True` (all other options left at their defaults), then
    passed through `ignore_differences`; the two results are compared for
    equality.
    """
    rendered = [
        ignore_differences(
            data_tools.ast2template(tree, loose_constraints=True))
        for tree in (ast1, ast2)
    ]
    return rendered[0] == rendered[1]