def add_judgement(data_dir, nl, command, correct_template='', correct_command=''):
    """
    Append a new judgement.
    """
    data_dir = os.path.join(data_dir, 'manual_judgements')
    manual_judgement_path = os.path.join(data_dir, 'manual.evaluations.author')
    if not os.path.exists(manual_judgement_path):
        with open(manual_judgement_path, 'w') as o_f:
            o_f.write(
                'description,prediction,template,correct template,correct command\n')
    with open(manual_judgement_path, 'a') as o_f:
        temp = data_tools.cmd2template(command, loose_constraints=True)
        if not correct_template:
            correct_template = 'n'
        if not correct_command:
            correct_command = 'n'
        o_f.write('"{}","{}","{}","{}","{}"\n'.format(
            nl.replace('"', '""'),
            command.replace('"', '""'),
            temp.replace('"', '""'),
            correct_template.replace('"', '""'),
            correct_command.replace('"', '""')))
    print('new judgement added to {}'.format(manual_judgement_path))
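# Illustrative sketch (not part of the original module): how add_judgement might
# be called to record a manually verified translation. The data directory and
# the example strings below are hypothetical placeholders.
def _example_add_judgement(data_dir='data/bash'):
    add_judgement(
        data_dir,
        nl='find all .txt files under the current directory',
        command='find . -name "*.txt"',
        correct_template='y',
        correct_command='y')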
def load_cached_evaluations_from_file(input_file, treat_empty_as_correct=False,
                                      verbose=True):
    structure_eval_results = {}
    command_eval_results = {}
    with open(input_file, encoding='utf-8') as f:
        if verbose:
            print('reading cached evaluations from {}'.format(input_file))
        reader = csv.DictReader(f)
        current_nl_key = ''
        for row in reader:
            if row['description']:
                current_nl_key = get_example_nl_key(row['description'])
            pred_cmd = row['prediction']
            if 'template' in row:
                pred_temp = row['template']
            else:
                pred_temp = data_tools.cmd2template(
                    pred_cmd, loose_constraints=True)
            command_eval = row['correct command']
            if treat_empty_as_correct:
                command_eval = normalize_judgement(command_eval)
            command_example_key = '{}<NL_PREDICTION>{}'.format(
                current_nl_key, pred_cmd)
            if command_eval:
                command_eval_results[command_example_key] = command_eval
            structure_eval = row['correct template']
            if treat_empty_as_correct:
                structure_eval = normalize_judgement(structure_eval)
            structure_example_key = '{}<NL_PREDICTION>{}'.format(
                current_nl_key, pred_temp)
            if structure_eval:
                structure_eval_results[structure_example_key] = structure_eval
    return structure_eval_results, command_eval_results
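# Illustrative sketch (not part of the original module): cached evaluations are
# keyed by the normalized description joined to the prediction (or its template)
# with the '<NL_PREDICTION>' separator, as constructed above. The input path is
# a hypothetical placeholder; get_example_nl_key is the same helper used above.
def _example_lookup_cached_evaluation(nl, pred_cmd,
                                      input_file='manual.evaluations.author'):
    structure_evals, command_evals = load_cached_evaluations_from_file(
        input_file, treat_empty_as_correct=False, verbose=False)
    key = '{}<NL_PREDICTION>{}'.format(get_example_nl_key(nl), pred_cmd)
    return command_evals.get(key), structure_evals.get(key)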
def compute_cm_stats():
    input_file = sys.argv[1]
    unique_commands = set()
    unique_templates = set()
    unique_tokens = set()
    tokens_per_cmd = []
    cmds_per_token = collections.defaultdict(int)
    with open(input_file) as f:
        for line in f:
            cm = line.strip()
            unique_commands.add(cm)
            temp = data_tools.cmd2template(cm, loose_constraints=True)
            unique_templates.add(temp)
            tokens = data_tools.bash_tokenizer(cm, loose_constraints=True)
            unique_tokens |= set(tokens)
            tokens_per_cmd.append(len(tokens))
            for token in tokens:
                cmds_per_token[token] += 1
    print('# unique commands: {}'.format(len(unique_commands)))
    print('# unique templates: {}'.format(len(unique_templates)))
    print('# unique tokens: {}'.format(len(unique_tokens)))
    print('# tokens per command: average {}, median {}'.format(
        np.mean(tokens_per_cmd), np.median(tokens_per_cmd)))
    # materialize the dict view so numpy can compute statistics on it
    cmd_counts = list(cmds_per_token.values())
    print('# commands per token: average {}, median {}'.format(
        np.mean(cmd_counts), np.median(cmd_counts)))
def extract_rewrites(data):
    """Extract all pairs of rewrites from a parallel corpus."""
    nls, cms = data

    # Step 1: group pairs with the same natural language description.
    group_pairs_by_nl = collections.defaultdict(set)
    for nl, cm in zip(nls, cms):
        nl = nl.strip()
        cm = cm.strip()
        if nl.lower() == "na":
            continue
        if not nl:
            continue
        if not cm:
            continue
        nl_tokens, _ = tokenizer.ner_tokenizer(nl)
        nl_temp = ' '.join(nl_tokens)
        cm_temp = data_tools.cmd2template(cm)
        if not cm_temp in group_pairs_by_nl[nl_temp]:
            group_pairs_by_nl[nl_temp].add(cm_temp)

    # Step 2: cluster the commands with the same natural language explanations.
    merged = set()
    # materialize the keys so they can be indexed below
    nls = list(group_pairs_by_nl.keys())
    for i in xrange(len(nls)):
        nl = nls[i]
        cm_temp_set = group_pairs_by_nl[nl]
        for j in xrange(i + 1, len(nls)):
            nl2 = nls[j]
            cm_temp_set2 = group_pairs_by_nl[nl2]
            if len(cm_temp_set & cm_temp_set2) >= 2:
                for cm_temp in cm_temp_set:
                    if not cm_temp in group_pairs_by_nl[nl2]:
                        group_pairs_by_nl[nl2].add(cm_temp)
                merged.add(i)

    # Step 3: remove redundant clusters after merge.
    rewrites = {}
    for i in xrange(len(nls)):
        if not i in merged:
            rewrites[nls[i]] = group_pairs_by_nl[nls[i]]

    # Step 4: print extracted rewrites and store in database.
    with DBConnection() as db:
        db.create_schema()
        for nl, cm_temps in sorted(
                rewrites.items(), key=lambda x: len(x[1]), reverse=True)[:10]:
            if len(cm_temps) >= 2:
                for cm_temp1 in cm_temps:
                    for cm_temp2 in cm_temps:
                        if cm_temp1 == cm_temp2:
                            continue
                        if not db.exist_rewrite((cm_temp1, cm_temp2)):
                            db.add_rewrite((cm_temp1, cm_temp2))
                            print("* {} --> {}".format(cm_temp1, cm_temp2))
            print()
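# Illustrative sketch (not part of the original module): extract_rewrites expects
# a pair of parallel lists (natural language descriptions, commands). The sample
# data below is hypothetical; the call also writes the extracted rewrites to the
# database via DBConnection.
def _example_extract_rewrites():
    nls = ['remove all empty files', 'delete every file of size zero']
    cms = ['find . -empty -delete', 'find . -size 0 -delete']
    extract_rewrites((nls, cms))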
def load_cached_correct_translations(data_dir, treat_empty_as_correct=False,
                                     verbose=False):
    """
    Load cached correct translations from disk.

    :return: nl -> template translation map, nl -> command translation map
    """
    command_translations = collections.defaultdict(set)
    template_translations = collections.defaultdict(set)
    eval_files = []
    for file_name in os.listdir(data_dir):
        if 'evaluations' in file_name and not file_name.endswith('base'):
            eval_files.append(file_name)
    for file_name in sorted(eval_files):
        manual_judgement_path = os.path.join(data_dir, file_name)
        with open(manual_judgement_path) as f:
            if verbose:
                print('reading cached evaluations from {}'.format(
                    manual_judgement_path))
            reader = csv.DictReader(f)
            current_nl_key = ''
            for row in reader:
                if row['description']:
                    current_nl_key = get_example_nl_key(row['description'])
                pred_cmd = row['prediction']
                if 'template' in row:
                    pred_temp = row['template']
                else:
                    pred_temp = data_tools.cmd2template(
                        pred_cmd, loose_constraints=True)
                structure_eval = row['correct template']
                if treat_empty_as_correct:
                    structure_eval = normalize_judgement(structure_eval)
                command_eval = row['correct command']
                if treat_empty_as_correct:
                    command_eval = normalize_judgement(command_eval)
                if structure_eval == 'y':
                    template_translations[current_nl_key].add(pred_temp)
                if command_eval == 'y':
                    command_translations[current_nl_key].add(pred_cmd)
    print('{} template translations loaded'.format(len(template_translations)))
    print('{} command translations loaded'.format(len(command_translations)))
    return template_translations, command_translations
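# Illustrative sketch (not part of the original module): checking whether a
# predicted command (or its template) is already cached as a correct translation
# of a description. The directory path is a hypothetical placeholder.
def _example_is_known_correct(nl, pred_cmd,
                              data_dir='data/bash/manual_judgements'):
    template_translations, command_translations = \
        load_cached_correct_translations(data_dir)
    nl_key = get_example_nl_key(nl)
    pred_temp = data_tools.cmd2template(pred_cmd, loose_constraints=True)
    return (pred_cmd in command_translations[nl_key],
            pred_temp in template_translations[nl_key])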
def get_automatic_evaluation_metrics(grouped_dataset, prediction_list, vocabs,
                                     FLAGS, top_k, num_samples=-1, verbose=False):
    cmd_parser = data_tools.bash_parser
    rev_sc_vocab = vocabs.rev_sc_vocab if vocabs is not None else None

    # Load cached evaluation results
    structure_eval_cache, command_eval_cache = \
        load_cached_evaluations(
            os.path.join(FLAGS.data_dir, 'manual_judgements'))

    # Compute manual evaluation scores on a subset of examples
    if num_samples > 0:
        # Get FIXED dev set samples
        random.seed(100)
        example_ids = list(range(len(grouped_dataset)))
        random.shuffle(example_ids)
        sample_ids = example_ids[:100]
        grouped_dataset = [grouped_dataset[i] for i in sample_ids]
        prediction_list = [prediction_list[i] for i in sample_ids]

    num_eval = 0
    top_k_temp_correct = np.zeros([len(grouped_dataset), top_k])
    top_k_str_correct = np.zeros([len(grouped_dataset), top_k])
    top_k_cms = np.zeros([len(grouped_dataset), top_k])
    top_k_bleu = np.zeros([len(grouped_dataset), top_k])

    command_gt_asts_list, pred_ast_list = [], []
    for data_id in xrange(len(grouped_dataset)):
        _, data_group = grouped_dataset[data_id]
        sc_str = data_group[0].sc_txt.strip()
        sc_key = get_example_nl_key(sc_str)
        if vocabs is not None:
            sc_tokens = [rev_sc_vocab[i] for i in data_group[0].sc_ids]
            if FLAGS.channel == 'char':
                sc_features = ''.join(sc_tokens)
                sc_features = sc_features.replace(constants._SPACE, ' ')
            else:
                sc_features = ' '.join(sc_tokens)
        command_gts = [dp.tg_txt.strip() for dp in data_group]
        command_gt_asts = [cmd_parser(cmd) for cmd in command_gts]
        command_gt_asts_list.append(command_gt_asts)
        template_gts = [
            data_tools.cmd2template(cmd, loose_constraints=True)
            for cmd in command_gts
        ]
        template_gt_asts = [cmd_parser(temp) for temp in template_gts]
        if verbose:
            print("Example {}".format(data_id))
            print("Original Source: {}".format(sc_str.encode('utf-8')))
            if vocabs is not None:
                print("Source: {}".format(
                    [x.encode('utf-8') for x in sc_features]))
            for j, command_gt in enumerate(command_gts):
                print("GT Target {}: {}".format(
                    j + 1, command_gt.strip().encode('utf-8')))
        num_eval += 1
        predictions = prediction_list[data_id]
        for i in xrange(len(predictions)):
            pred_cmd = predictions[i]
            pred_ast = cmd_parser(pred_cmd)
            if i == 0:
                pred_ast_list.append(pred_ast)
            pred_temp = data_tools.cmd2template(pred_cmd, loose_constraints=True)
            # A) Exact match with ground truths & existing judgements
            command_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_cmd)
            structure_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_temp)
            # B) Match ignoring flag orders
            temp_match = tree_dist.one_match(
                template_gt_asts, pred_ast, ignore_arg_value=True)
            str_match = tree_dist.one_match(
                command_gt_asts, pred_ast, ignore_arg_value=False)
            if command_eval_cache and command_example_key in command_eval_cache:
                str_match = normalize_judgement(
                    command_eval_cache[command_example_key]) == 'y'
            if structure_eval_cache and structure_example_key in structure_eval_cache:
                temp_match = normalize_judgement(
                    structure_eval_cache[structure_example_key]) == 'y'
            if temp_match:
                top_k_temp_correct[data_id, i] = 1
            if str_match:
                top_k_str_correct[data_id, i] = 1
            cms = token_based.command_match_score(command_gt_asts, pred_ast)
            # if pred_cmd.strip():
            #     bleu = token_based.sentence_bleu_score(command_gt_asts, pred_ast)
            # else:
            #     bleu = 0
            bleu = nltk.translate.bleu_score.sentence_bleu(command_gts, pred_cmd)
            top_k_cms[data_id, i] = cms
            top_k_bleu[data_id, i] = bleu
            if verbose:
                print("Prediction {}: {} ({}, {})".format(
                    i + 1, pred_cmd, cms, bleu))
        if verbose:
            print()

    bleu = token_based.corpus_bleu_score(command_gt_asts_list, pred_ast_list)

    top_temp_acc = [-1 for _ in [1, 3, 5, 10]]
    top_cmd_acc = [-1 for _ in [1, 3, 5, 10]]
    top_cms = [-1 for _ in [1, 3, 5, 10]]
    top_bleu = [-1 for _ in [1, 3, 5, 10]]
    top_temp_acc[0] = top_k_temp_correct[:, 0].mean()
    top_cmd_acc[0] = top_k_str_correct[:, 0].mean()
    top_cms[0] = top_k_cms[:, 0].mean()
    top_bleu[0] = top_k_bleu[:, 0].mean()
    print("{} examples evaluated".format(num_eval))
    print("Top 1 Template Acc = %.3f" % top_temp_acc[0])
    print("Top 1 Command Acc = %.3f" % top_cmd_acc[0])
    print("Average top 1 Template Match Score = %.3f" % top_cms[0])
    print("Average top 1 BLEU Score = %.3f" % top_bleu[0])
    if len(predictions) > 1:
        top_temp_acc[1] = np.max(top_k_temp_correct[:, :3], 1).mean()
        top_cmd_acc[1] = np.max(top_k_str_correct[:, :3], 1).mean()
        top_cms[1] = np.max(top_k_cms[:, :3], 1).mean()
        top_bleu[1] = np.max(top_k_bleu[:, :3], 1).mean()
        print("Top 3 Template Acc = %.3f" % top_temp_acc[1])
        print("Top 3 Command Acc = %.3f" % top_cmd_acc[1])
        print("Average top 3 Template Match Score = %.3f" % top_cms[1])
        print("Average top 3 BLEU Score = %.3f" % top_bleu[1])
    if len(predictions) > 3:
        top_temp_acc[2] = np.max(top_k_temp_correct[:, :5], 1).mean()
        top_cmd_acc[2] = np.max(top_k_str_correct[:, :5], 1).mean()
        top_cms[2] = np.max(top_k_cms[:, :5], 1).mean()
        top_bleu[2] = np.max(top_k_bleu[:, :5], 1).mean()
        print("Top 5 Template Acc = %.3f" % top_temp_acc[2])
        print("Top 5 Command Acc = %.3f" % top_cmd_acc[2])
        print("Average top 5 Template Match Score = %.3f" % top_cms[2])
        print("Average top 5 BLEU Score = %.3f" % top_bleu[2])
    if len(predictions) > 5:
        top_temp_acc[3] = np.max(top_k_temp_correct[:, :10], 1).mean()
        top_cmd_acc[3] = np.max(top_k_str_correct[:, :10], 1).mean()
        top_cms[3] = np.max(top_k_cms[:, :10], 1).mean()
        top_bleu[3] = np.max(top_k_bleu[:, :10], 1).mean()
        print("Top 10 Template Acc = %.3f" % top_temp_acc[3])
        print("Top 10 Command Acc = %.3f" % top_cmd_acc[3])
        print("Average top 10 Template Match Score = %.3f" % top_cms[3])
        print("Average top 10 BLEU Score = %.3f" % top_bleu[3])
    print('Corpus BLEU = %.3f' % bleu)
    print()

    metrics = {}
    metrics['acc_f'] = top_cmd_acc
    metrics['acc_t'] = top_temp_acc
    metrics['cms'] = top_cms
    metrics['bleu'] = top_bleu
    return metrics
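# Illustrative sketch (not part of the original module): the metrics dictionary
# returned above stores one list per metric, indexed by the top-k cutoffs
# 1, 3, 5, 10 (entries are left at -1 when fewer predictions are available).
def _example_report_metrics(metrics):
    for k_index, k in enumerate([1, 3, 5, 10]):
        if metrics['acc_f'][k_index] >= 0:
            print('top-{}: command acc {:.3f}, template acc {:.3f}'.format(
                k, metrics['acc_f'][k_index], metrics['acc_t'][k_index]))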
def combine_annotations():
    """
    Combine the annotations input by three annotators.

    :param input_file1: main annotation file 1.
    :param input_file2: main annotation file 2 (should contain the same number
        of lines as input_file1).
    :param input_file3: supplementary annotation file which contains
        annotations of lines in input_file1 and input_file2 that contain a
        disagreement.
    :param output_file: file that contains the combined annotations.
    """
    input_file1 = sys.argv[1]
    input_file2 = sys.argv[2]
    input_file3 = sys.argv[3]
    output_file = sys.argv[4]

    o_f = open(output_file, 'w')
    o_f.write('description,prediction,template,correct template,correct command,'
              'correct template A,correct command A,'
              'correct template B,correct command B,'
              'correct template C,correct command C\n')

    sup_structure_eval, sup_command_eval = load_cached_evaluations_from_file(
        input_file3, treat_empty_as_correct=True)
    # for key in sup_structure_eval:
    #     print(key)
    # print('------------------')

    with open(input_file1) as f1:
        with open(input_file2) as f2:
            reader1 = csv.DictReader(f1)
            reader2 = csv.DictReader(f2)
            current_desp = ''
            for row1, row2 in zip(reader1, reader2):
                row1_template_eval = normalize_judgement(row1['correct template'].strip())
                row1_command_eval = normalize_judgement(row1['correct command'].strip())
                row2_template_eval = normalize_judgement(row2['correct template'].strip())
                row2_command_eval = normalize_judgement(row2['correct command'].strip())
                if row1['description']:
                    current_desp = row1['description'].strip()
                sc_key = get_example_nl_key(current_desp)
                pred_cmd = row1['prediction'].strip()
                if not pred_cmd:
                    row1_template_eval, row1_command_eval = 'n', 'n'
                    row2_template_eval, row2_command_eval = 'n', 'n'
                pred_temp = data_tools.cmd2template(pred_cmd, loose_constraints=True)
                structure_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_temp)
                command_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_cmd)
                row3_template_eval, row3_command_eval = None, None
                if structure_example_key in sup_structure_eval:
                    row3_template_eval = sup_structure_eval[structure_example_key]
                if command_example_key in sup_command_eval:
                    row3_command_eval = sup_command_eval[command_example_key]
                if (row1_template_eval != row2_template_eval or
                        row1_command_eval != row2_command_eval):
                    if row1_template_eval != row2_template_eval:
                        if row3_template_eval is None:
                            print(structure_example_key)
                        assert(row3_template_eval is not None)
                        template_eval = row3_template_eval
                    else:
                        template_eval = row1_template_eval
                    if row1_command_eval != row2_command_eval:
                        # if row3_command_eval is None:
                        #     print(command_example_key)
                        assert(row3_command_eval is not None)
                        command_eval = row3_command_eval
                    else:
                        command_eval = row1_command_eval
                else:
                    template_eval = row1_template_eval
                    command_eval = row1_command_eval
                if row3_template_eval is None:
                    row3_template_eval = ''
                if row3_command_eval is None:
                    row3_command_eval = ''
                o_f.write('"{}","{}","{}",{},{},{},{},{},{},{},{}\n'.format(
                    current_desp.replace('"', '""'),
                    pred_cmd.replace('"', '""'),
                    pred_temp.replace('"', '""'),
                    template_eval, command_eval,
                    row1_template_eval, row1_command_eval,
                    row2_template_eval, row2_command_eval,
                    row3_template_eval, row3_command_eval))
    o_f.close()
def print_error_analysis_sheet():
    input_file1 = sys.argv[1]
    input_file2 = sys.argv[2]
    input_file3 = sys.argv[3]
    output_file = sys.argv[4]

    o_f = open(output_file, 'w')
    o_f.write('description,model,prediction,correct template,correct command,'
              'correct template A,correct command A,'
              'correct template B,correct command B,'
              'correct template C,correct command C\n')

    sup_structure_eval, sup_command_eval = load_cached_evaluations_from_file(
        input_file3, treat_empty_as_correct=True)
    # for key in sup_structure_eval:
    #     print(key)
    # print('------------------')

    with open(input_file1) as f1:
        with open(input_file2) as f2:
            reader1 = csv.DictReader(f1)
            reader2 = csv.DictReader(f2)
            current_desp = ''
            for row_id, (row1, row2) in enumerate(zip(reader1, reader2)):
                if row1['description']:
                    current_desp = row1['description'].strip()
                model_name = row2['model']
                if not model_name in ['partial.token-copynet', 'tellina']:
                    continue
                if row_id % 3 != 0:
                    continue
                row1_template_eval = normalize_judgement(row1['correct template'].strip())
                row1_command_eval = normalize_judgement(row1['correct command'].strip())
                row2_template_eval = normalize_judgement(row2['correct template'].strip())
                row2_command_eval = normalize_judgement(row2['correct command'].strip())
                sc_key = get_example_nl_key(current_desp)
                pred_cmd = row1['prediction'].strip()
                if not pred_cmd:
                    row1_template_eval, row1_command_eval = 'n', 'n'
                    row2_template_eval, row2_command_eval = 'n', 'n'
                pred_temp = data_tools.cmd2template(pred_cmd, loose_constraints=True)
                structure_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_temp)
                command_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_cmd)
                row3_template_eval, row3_command_eval = None, None
                if structure_example_key in sup_structure_eval:
                    row3_template_eval = sup_structure_eval[structure_example_key]
                if command_example_key in sup_command_eval:
                    row3_command_eval = sup_command_eval[command_example_key]
                if (row1_template_eval != row2_template_eval or
                        row1_command_eval != row2_command_eval):
                    if row1_template_eval != row2_template_eval:
                        if row3_template_eval is None:
                            print(structure_example_key)
                        assert (row3_template_eval is not None)
                        template_eval = row3_template_eval
                    else:
                        template_eval = row1_template_eval
                    if row1_command_eval != row2_command_eval:
                        # if row3_command_eval is None:
                        #     print(command_example_key)
                        assert (row3_command_eval is not None)
                        command_eval = row3_command_eval
                    else:
                        command_eval = row1_command_eval
                else:
                    template_eval = row1_template_eval
                    command_eval = row1_command_eval
                if row3_template_eval is None:
                    row3_template_eval = ''
                if row3_command_eval is None:
                    row3_command_eval = ''
                o_f.write('"{}","{}","{}",{},{},{},{},{},{},{},{}\n'.format(
                    current_desp.replace('"', '""'),
                    model_name,
                    pred_cmd.replace('"', '""'),
                    template_eval, command_eval,
                    row1_template_eval, row1_command_eval,
                    row2_template_eval, row2_command_eval,
                    row3_template_eval, row3_command_eval))
    o_f.close()
def gen_manual_evaluation_csv_single_model(dataset, FLAGS):
    """
    Generate .csv spreadsheet for manual evaluation on dev/test set examples
    for a specific model.
    """
    # Group dataset
    tokenizer_selector = "cm" if FLAGS.explain else "nl"
    grouped_dataset = data_utils.group_parallel_data(
        dataset, use_bucket=True, tokenizer_selector=tokenizer_selector)

    # Load model predictions
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    prediction_list = load_predictions(model_dir, decode_sig, top_k=3)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth list and prediction list length must "
                         "be equal: {} vs. {}".format(len(grouped_dataset),
                                                      len(prediction_list)))

    # Load additional ground truths
    template_translations, command_translations = load_cached_correct_translations(
        FLAGS.data_dir)

    # Load cached evaluation results
    structure_eval_cache, command_eval_cache = load_cached_evaluations(
        os.path.join(FLAGS.data_dir, 'manual_judgements'))

    eval_bash = FLAGS.dataset.startswith("bash")
    cmd_parser = data_tools.bash_parser if eval_bash else data_tools.paren_parser

    output_path = os.path.join(model_dir, 'manual.evaluations.single.model')
    with open(output_path, 'w') as o_f:
        # write spreadsheet header
        o_f.write('id,description,command,correct template,correct command\n')
        for example_id in range(len(grouped_dataset)):
            data_group = grouped_dataset[example_id][1]
            sc_txt = data_group[0].sc_txt.strip()
            sc_key = get_example_nl_key(sc_txt)
            command_gts = [dp.tg_txt for dp in data_group]
            # the cached translation maps hold sets, so merge with set union
            command_gts = set(command_gts) | command_translations[sc_key]
            command_gt_asts = [
                data_tools.bash_parser(cmd) for cmd in command_gts
            ]
            template_gts = [
                data_tools.cmd2template(cmd, loose_constraints=True)
                for cmd in command_gts
            ]
            template_gts = set(template_gts) | template_translations[sc_key]
            template_gt_asts = [
                data_tools.bash_parser(temp) for temp in template_gts
            ]
            predictions = prediction_list[example_id]
            for i in xrange(3):
                if i >= len(predictions):
                    o_f.write(',,,n,n\n')
                    continue
                pred_cmd = predictions[i]
                pred_tree = cmd_parser(pred_cmd)
                pred_temp = data_tools.ast2template(pred_tree, loose_constraints=True)
                temp_match = tree_dist.one_match(
                    template_gt_asts, pred_tree, ignore_arg_value=True)
                str_match = tree_dist.one_match(
                    command_gt_asts, pred_tree, ignore_arg_value=False)
                # Match ground truths & existing judgements
                command_example_sig = '{}<NL_PREDICTION>{}'.format(sc_key, pred_cmd)
                structure_example_sig = '{}<NL_PREDICTION>{}'.format(sc_key, pred_temp)
                command_eval, structure_eval = '', ''
                if str_match:
                    command_eval = 'y'
                    structure_eval = 'y'
                elif temp_match:
                    structure_eval = 'y'
                if command_eval_cache and \
                        command_example_sig in command_eval_cache:
                    command_eval = command_eval_cache[command_example_sig]
                if structure_eval_cache and \
                        structure_example_sig in structure_eval_cache:
                    structure_eval = structure_eval_cache[structure_example_sig]
                if i == 0:
                    o_f.write('{},"{}","{}",{},{}\n'.format(
                        example_id,
                        sc_txt.replace('"', '""'),
                        pred_cmd.replace('"', '""'),
                        structure_eval, command_eval))
                else:
                    o_f.write(',,"{}",{},{}\n'.format(
                        pred_cmd.replace('"', '""'),
                        structure_eval, command_eval))
    print('manual evaluation spreadsheet saved to {}'.format(output_path))