Code example #1
File: eval_tools.py Project: dhuruvasaditya/Nlc2cmd
def add_judgement(data_dir,
                  nl,
                  command,
                  correct_template='',
                  correct_command=''):
    """
    Append a new judgement
    """
    data_dir = os.path.join(data_dir, 'manual_judgements')
    manual_judgement_path = os.path.join(data_dir, 'manual.evaluations.author')
    if not os.path.exists(manual_judgement_path):
        with open(manual_judgement_path, 'w') as o_f:
            o_f.write(
                'description,prediction,template,correct template,correct command\n'
            )
    with open(manual_judgement_path, 'a') as o_f:
        temp = data_tools.cmd2template(command, loose_constraints=True)
        if not correct_template:
            correct_template = 'n'
        if not correct_command:
            correct_command = 'n'
        o_f.write('"{}","{}","{}","{}","{}"\n'.format(
            nl.replace('"', '""'), command.replace('"', '""'),
            temp.replace('"', '""'), correct_template.replace('"', '""'),
            correct_command.replace('"', '""')))
    print('new judgement added to {}'.format(manual_judgement_path))
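For orientation, here is a minimal sketch of the data_tools.cmd2template call that these examples revolve around; the import path (bashlint.data_tools, as in the nl2bash codebase) and the sample command are assumptions for illustration only.

# Minimal sketch, assuming the project's bashlint.data_tools module is importable.
from bashlint import data_tools

cmd = 'find /tmp -name "*.log" -mtime +7'  # hypothetical example command
# Collapse concrete argument values into placeholders to obtain the command's template form.
template = data_tools.cmd2template(cmd, loose_constraints=True)
print(template)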
Code example #2
File: eval_tools.py Project: dhuruvasaditya/Nlc2cmd
def load_cached_evaluations_from_file(input_file,
                                      treat_empty_as_correct=False,
                                      verbose=True):
    structure_eval_results = {}
    command_eval_results = {}
    with open(input_file, encoding='utf-8') as f:
        if verbose:
            print('reading cached evaluations from {}'.format(input_file))
        reader = csv.DictReader(f)
        current_nl_key = ''
        for row in reader:
            if row['description']:
                current_nl_key = get_example_nl_key(row['description'])
            pred_cmd = row['prediction']
            if 'template' in row:
                pred_temp = row['template']
            else:
                pred_temp = data_tools.cmd2template(pred_cmd,
                                                    loose_constraints=True)
            command_eval = row['correct command']
            if treat_empty_as_correct:
                command_eval = normalize_judgement(command_eval)
            command_example_key = '{}<NL_PREDICTION>{}'.format(
                current_nl_key, pred_cmd)
            if command_eval:
                command_eval_results[command_example_key] = command_eval
            structure_eval = row['correct template']
            if treat_empty_as_correct:
                structure_eval = normalize_judgement(structure_eval)
            structure_example_key = '{}<NL_PREDICTION>{}'.format(
                current_nl_key, pred_temp)
            if structure_eval:
                structure_eval_results[structure_example_key] = structure_eval
    return structure_eval_results, command_eval_results
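A hedged usage sketch for load_cached_evaluations_from_file, reading back a file in the format written by example #1; the path and the lookup key below are hypothetical.

# Hypothetical path; the file follows the CSV header written by add_judgement above.
judgement_file = 'data/bash/manual_judgements/manual.evaluations.author'
structure_evals, command_evals = load_cached_evaluations_from_file(
    judgement_file, treat_empty_as_correct=False, verbose=True)
# Keys have the form '<normalized description><NL_PREDICTION><prediction or template>'.
key = 'find all empty files<NL_PREDICTION>find . -type f -empty'
print(command_evals.get(key, 'no cached judgement'))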
Code example #3
File: data_stats.py Project: dhuruvasaditya/Nlc2cmd
def compute_cm_stats():
    input_file = sys.argv[1]
    unique_commands = set()
    unique_templates = set()
    unique_tokens = set()
    tokens_per_cmd = []
    cmds_per_token = collections.defaultdict(int)
    with open(input_file) as f:
        for line in f:
            cm = line.strip()
            unique_commands.add(cm)
            temp = data_tools.cmd2template(cm, loose_constraints=True)
            unique_templates.add(temp)
            tokens = data_tools.bash_tokenizer(cm, loose_constraints=True)
            unique_tokens |= set(tokens)
            tokens_per_cmd.append(len(tokens))
            for token in tokens:
                cmds_per_token[token] += 1
    print('# unique commands: {}'.format(len(unique_commands)))
    print('# unique templates: {}'.format(len(unique_templates)))
    print('# unique tokens: {}'.format(len(unique_tokens)))
    print('# tokens per command: average {}, median {}'.format(
        np.mean(tokens_per_cmd), np.median(tokens_per_cmd)))
    print('# commands per token: average {}, median {}'.format(
        np.mean(list(cmds_per_token.values())),
        np.median(list(cmds_per_token.values()))))
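compute_cm_stats takes the command file from sys.argv[1]; a minimal sketch of the assumed invocation follows (the data file name is hypothetical).

# Assumed invocation, with one bash command per line in the input file:
#   python data_stats.py data/bash/all.cm
if __name__ == '__main__':
    compute_cm_stats()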
Code example #4
def extract_rewrites(data):
    """Extract all pairs of rewrites from a parallel corpus."""
    nls, cms = data

    # Step 1: group pairs with the same natural language description.
    group_pairs_by_nl = collections.defaultdict(set)
    for nl, cm in zip(nls, cms):
        nl = nl.strip()
        cm = cm.strip()
        if nl.lower() == "na":
            continue
        if not nl:
            continue
        if not cm:
            continue
        nl_tokens, _ = tokenizer.ner_tokenizer(nl)
        nl_temp = ' '.join(nl_tokens)
        cm_temp = data_tools.cmd2template(cm)
        if not cm_temp in group_pairs_by_nl[nl_temp]:
            group_pairs_by_nl[nl_temp].add(cm_temp)

    # Step 2: cluster the commands with the same natural language explanations.
    merged = set()
    nls = list(group_pairs_by_nl.keys())
    for i in range(len(nls)):
        nl = nls[i]
        cm_temp_set = group_pairs_by_nl[nl]
        for j in range(i + 1, len(nls)):
            nl2 = nls[j]
            cm_temp_set2 = group_pairs_by_nl[nl2]
            if len(cm_temp_set & cm_temp_set2) >= 2:
                for cm_temp in cm_temp_set:
                    if not cm_temp in group_pairs_by_nl[nl2]:
                        group_pairs_by_nl[nl2].add(cm_temp)
                merged.add(i)

    # Step 3: remove redundant clusters after merge.
    rewrites = {}
    for i in range(len(nls)):
        if not i in merged:
            rewrites[nls[i]] = group_pairs_by_nl[nls[i]]

    # Step 4: print extracted rewrites and store in database.
    with DBConnection() as db:
        db.create_schema()
        for nl, cm_temps in sorted(rewrites.items(),
                                   key=lambda x: len(x[1]),
                                   reverse=True)[:10]:
            if len(cm_temps) >= 2:
                for cm_temp1 in cm_temps:
                    for cm_temp2 in cm_temps:
                        if cm_temp1 == cm_temp2:
                            continue
                        if not db.exist_rewrite((cm_temp1, cm_temp2)):
                            db.add_rewrite((cm_temp1, cm_temp2))
                            print("* {} --> {}".format(cm_temp1, cm_temp2))
                print()
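A sketch of the input format extract_rewrites expects, using a hypothetical two-pair corpus; in the project the two lists come from paired natural-language / command files, and discovered rewrites are stored through DBConnection.

# Hypothetical toy corpus: each description is paired with one command.
nls = ['delete all empty files in the current directory',
       'remove empty files under the current folder']
cms = ['find . -type f -empty -delete',
       'find . -type f -empty -exec rm {} \\;']
extract_rewrites((nls, cms))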
Code example #5
File: eval_tools.py Project: dhuruvasaditya/Nlc2cmd
def load_cached_correct_translations(data_dir,
                                     treat_empty_as_correct=False,
                                     verbose=False):
    """
    Load cached correct translations from disk.

    :return: nl -> template translation map, nl -> command translation map
    """
    command_translations = collections.defaultdict(set)
    template_translations = collections.defaultdict(set)
    eval_files = []
    for file_name in os.listdir(data_dir):
        if 'evaluations' in file_name and not file_name.endswith('base'):
            eval_files.append(file_name)
    for file_name in sorted(eval_files):
        manual_judgement_path = os.path.join(data_dir, file_name)
        with open(manual_judgement_path) as f:
            if verbose:
                print('reading cached evaluations from {}'.format(
                    manual_judgement_path))
            reader = csv.DictReader(f)
            current_nl_key = ''
            for row in reader:
                if row['description']:
                    current_nl_key = get_example_nl_key(row['description'])
                pred_cmd = row['prediction']
                if 'template' in row:
                    pred_temp = row['template']
                else:
                    pred_temp = data_tools.cmd2template(pred_cmd,
                                                        loose_constraints=True)
                structure_eval = row['correct template']
                if treat_empty_as_correct:
                    structure_eval = normalize_judgement(structure_eval)
                command_eval = row['correct command']
                if treat_empty_as_correct:
                    command_eval = normalize_judgement(command_eval)
                if structure_eval == 'y':
                    template_translations[current_nl_key].add(pred_temp)
                if command_eval == 'y':
                    command_translations[current_nl_key].add(pred_cmd)
    print('{} template translations loaded'.format(len(template_translations)))
    print('{} command translations loaded'.format(len(command_translations)))

    return template_translations, command_translations
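A usage sketch for load_cached_correct_translations; the data directory and the query description below are hypothetical.

# Hypothetical directory containing the *evaluations* CSV files described above.
template_translations, command_translations = load_cached_correct_translations(
    'data/bash/manual_judgements', treat_empty_as_correct=True, verbose=True)
nl_key = get_example_nl_key('display files modified in the last 24 hours')  # hypothetical query
print(command_translations[nl_key])  # set of commands judged correct for this description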
Code example #6
File: eval_tools.py Project: dhuruvasaditya/Nlc2cmd
def get_automatic_evaluation_metrics(grouped_dataset,
                                     prediction_list,
                                     vocabs,
                                     FLAGS,
                                     top_k,
                                     num_samples=-1,
                                     verbose=False):
    cmd_parser = data_tools.bash_parser
    rev_sc_vocab = vocabs.rev_sc_vocab if vocabs is not None else None

    # Load cached evaluation results
    structure_eval_cache, command_eval_cache = \
        load_cached_evaluations(
            os.path.join(FLAGS.data_dir, 'manual_judgements'))

    # Compute manual evaluation scores on a subset of examples
    if num_samples > 0:
        # Get FIXED dev set samples
        random.seed(100)
        example_ids = list(range(len(grouped_dataset)))
        random.shuffle(example_ids)
        sample_ids = example_ids[:100]
        grouped_dataset = [grouped_dataset[i] for i in sample_ids]
        prediction_list = [prediction_list[i] for i in sample_ids]

    num_eval = 0
    top_k_temp_correct = np.zeros([len(grouped_dataset), top_k])
    top_k_str_correct = np.zeros([len(grouped_dataset), top_k])
    top_k_cms = np.zeros([len(grouped_dataset), top_k])
    top_k_bleu = np.zeros([len(grouped_dataset), top_k])

    command_gt_asts_list, pred_ast_list = [], []

    for data_id in range(len(grouped_dataset)):
        _, data_group = grouped_dataset[data_id]
        sc_str = data_group[0].sc_txt.strip()
        sc_key = get_example_nl_key(sc_str)
        if vocabs is not None:
            sc_tokens = [rev_sc_vocab[i] for i in data_group[0].sc_ids]
            if FLAGS.channel == 'char':
                sc_features = ''.join(sc_tokens)
                sc_features = sc_features.replace(constants._SPACE, ' ')
            else:
                sc_features = ' '.join(sc_tokens)
        command_gts = [dp.tg_txt.strip() for dp in data_group]
        command_gt_asts = [cmd_parser(cmd) for cmd in command_gts]
        command_gt_asts_list.append(command_gt_asts)
        template_gts = [
            data_tools.cmd2template(cmd, loose_constraints=True)
            for cmd in command_gts
        ]
        template_gt_asts = [cmd_parser(temp) for temp in template_gts]
        if verbose:
            print("Example {}".format(data_id))
            print("Original Source: {}".format(sc_str.encode('utf-8')))
            if vocabs is not None:
                print("Source: {}".format(
                    [x.encode('utf-8') for x in sc_features]))
            for j, command_gt in enumerate(command_gts):
                print("GT Target {}: {}".format(
                    j + 1,
                    command_gt.strip().encode('utf-8')))
        num_eval += 1
        predictions = prediction_list[data_id]
        for i in range(len(predictions)):
            pred_cmd = predictions[i]
            pred_ast = cmd_parser(pred_cmd)
            if i == 0:
                pred_ast_list.append(pred_ast)
            pred_temp = data_tools.cmd2template(pred_cmd,
                                                loose_constraints=True)
            # A) Exact match with ground truths & existing judgements
            command_example_key = '{}<NL_PREDICTION>{}'.format(
                sc_key, pred_cmd)
            structure_example_key = '{}<NL_PREDICTION>{}'.format(
                sc_key, pred_temp)
            # B) Match ignoring flag orders
            temp_match = tree_dist.one_match(template_gt_asts,
                                             pred_ast,
                                             ignore_arg_value=True)
            str_match = tree_dist.one_match(command_gt_asts,
                                            pred_ast,
                                            ignore_arg_value=False)
            if command_eval_cache and command_example_key in command_eval_cache:
                str_match = normalize_judgement(
                    command_eval_cache[command_example_key]) == 'y'
            if structure_eval_cache and structure_example_key in structure_eval_cache:
                temp_match = normalize_judgement(
                    structure_eval_cache[structure_example_key]) == 'y'
            if temp_match:
                top_k_temp_correct[data_id, i] = 1
            if str_match:
                top_k_str_correct[data_id, i] = 1
            cms = token_based.command_match_score(command_gt_asts, pred_ast)
            # if pred_cmd.strip():
            #     bleu = token_based.sentence_bleu_score(command_gt_asts, pred_ast)
            # else:
            #     bleu = 0
            bleu = nltk.translate.bleu_score.sentence_bleu(
                command_gts, pred_cmd)
            top_k_cms[data_id, i] = cms
            top_k_bleu[data_id, i] = bleu
            if verbose:
                print("Prediction {}: {} ({}, {})".format(
                    i + 1, pred_cmd, cms, bleu))
        if verbose:
            print()

    bleu = token_based.corpus_bleu_score(command_gt_asts_list, pred_ast_list)

    top_temp_acc = [-1 for _ in [1, 3, 5, 10]]
    top_cmd_acc = [-1 for _ in [1, 3, 5, 10]]
    top_cms = [-1 for _ in [1, 3, 5, 10]]
    top_bleu = [-1 for _ in [1, 3, 5, 10]]
    top_temp_acc[0] = top_k_temp_correct[:, 0].mean()
    top_cmd_acc[0] = top_k_str_correct[:, 0].mean()
    top_cms[0] = top_k_cms[:, 0].mean()
    top_bleu[0] = top_k_bleu[:, 0].mean()
    print("{} examples evaluated".format(num_eval))
    print("Top 1 Template Acc = %.3f" % top_temp_acc[0])
    print("Top 1 Command Acc = %.3f" % top_cmd_acc[0])
    print("Average top 1 Template Match Score = %.3f" % top_cms[0])
    print("Average top 1 BLEU Score = %.3f" % top_bleu[0])
    if len(predictions) > 1:
        top_temp_acc[1] = np.max(top_k_temp_correct[:, :3], 1).mean()
        top_cmd_acc[1] = np.max(top_k_str_correct[:, :3], 1).mean()
        top_cms[1] = np.max(top_k_cms[:, :3], 1).mean()
        top_bleu[1] = np.max(top_k_bleu[:, :3], 1).mean()
        print("Top 3 Template Acc = %.3f" % top_temp_acc[1])
        print("Top 3 Command Acc = %.3f" % top_cmd_acc[1])
        print("Average top 3 Template Match Score = %.3f" % top_cms[1])
        print("Average top 3 BLEU Score = %.3f" % top_bleu[1])
    if len(predictions) > 3:
        top_temp_acc[2] = np.max(top_k_temp_correct[:, :5], 1).mean()
        top_cmd_acc[2] = np.max(top_k_str_correct[:, :5], 1).mean()
        top_cms[2] = np.max(top_k_cms[:, :5], 1).mean()
        top_bleu[2] = np.max(top_k_bleu[:, :5], 1).mean()
        print("Top 5 Template Acc = %.3f" % top_temp_acc[2])
        print("Top 5 Command Acc = %.3f" % top_cmd_acc[2])
        print("Average top 5 Template Match Score = %.3f" % top_cms[2])
        print("Average top 5 BLEU Score = %.3f" % top_bleu[2])
    if len(predictions) > 5:
        top_temp_acc[3] = np.max(top_k_temp_correct[:, :10], 1).mean()
        top_cmd_acc[3] = np.max(top_k_str_correct[:, :10], 1).mean()
        top_cms[3] = np.max(top_k_cms[:, :10], 1).mean()
        top_bleu[3] = np.max(top_k_bleu[:, :10], 1).mean()
        print("Top 10 Template Acc = %.3f" % top_temp_acc[3])
        print("Top 10 Command Acc = %.3f" % top_cmd_acc[3])
        print("Average top 10 Template Match Score = %.3f" % top_cms[3])
        print("Average top 10 BLEU Score = %.3f" % top_bleu[3])
    print('Corpus BLEU = %.3f' % bleu)
    print()

    metrics = {}
    metrics['acc_f'] = top_cmd_acc
    metrics['acc_t'] = top_temp_acc
    metrics['cms'] = top_cms
    metrics['bleu'] = top_bleu

    return metrics
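For reference, the shape of the returned metrics dict, sketched as a comment; the placeholder names are illustrative and not values computed here.

# Hypothetical shape of the returned dict when each example has 3 predictions
# (-1 marks cutoffs larger than the number of available predictions):
# {'acc_f': [top1_cmd_acc, top3_cmd_acc, -1, -1],
#  'acc_t': [top1_temp_acc, top3_temp_acc, -1, -1],
#  'cms':   [top1_cms, top3_cms, -1, -1],
#  'bleu':  [top1_bleu, top3_bleu, -1, -1]}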
Code example #7
def combine_annotations():
    """
    Combine the annotations input by three annotators.

    :param input_file1: main annotation file 1.
    :param input_file2: main annotation file 2 (should contain the same number of
        lines as input_file1).
    :param input_file3: supplementary annotation file which contains annotations
        of lines in input_file1 and input_file2 that contain a disagreement.
    :param output_file: file that contains the combined annotations.
    """
    input_file1 = sys.argv[1]
    input_file2 = sys.argv[2]
    input_file3 = sys.argv[3]
    output_file = sys.argv[4]
    o_f = open(output_file, 'w')
    o_f.write('description,prediction,template,correct template,correct command,'
              'correct template A,correct command A,'
              'correct template B,correct command B,'
              'correct template C,correct command C\n')
    sup_structure_eval, sup_command_eval = load_cached_evaluations_from_file(
        input_file3, treat_empty_as_correct=True)
    # for key in sup_structure_eval:
    #     print(key)
    # print('------------------')
    with open(input_file1) as f1:
        with open(input_file2) as f2:
            reader1 = csv.DictReader(f1)
            reader2 = csv.DictReader(f2)
            current_desp = ''
            for row1, row2 in zip(reader1, reader2):
                row1_template_eval = normalize_judgement(row1['correct template'].strip())
                row1_command_eval = normalize_judgement(row1['correct command'].strip())
                row2_template_eval = normalize_judgement(row2['correct template'].strip())
                row2_command_eval = normalize_judgement(row2['correct command'].strip())
                if row1['description']:
                    current_desp = row1['description'].strip()
                sc_key = get_example_nl_key(current_desp)
                pred_cmd = row1['prediction'].strip()
                if not pred_cmd:
                    row1_template_eval, row1_command_eval = 'n', 'n'
                    row2_template_eval, row2_command_eval = 'n', 'n'
                pred_temp = data_tools.cmd2template(pred_cmd, loose_constraints=True)
                structure_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_temp)
                command_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_cmd)
                row3_template_eval, row3_command_eval = None, None
                if structure_example_key in sup_structure_eval:
                    row3_template_eval = sup_structure_eval[structure_example_key]
                if command_example_key in sup_command_eval:
                    row3_command_eval = sup_command_eval[command_example_key]
                if row1_template_eval != row2_template_eval or row1_command_eval != row2_command_eval:
                    if row1_template_eval != row2_template_eval:
                        if row3_template_eval is None:
                            print(structure_example_key)
                        assert(row3_template_eval is not None)
                        template_eval = row3_template_eval
                    else:
                        template_eval = row1_template_eval
                    if row1_command_eval != row2_command_eval:
                        # if row3_command_eval is None:
                        #     print(command_example_key)
                        assert(row3_command_eval is not None)
                        command_eval = row3_command_eval
                    else:
                        command_eval = row1_command_eval
                else:
                    template_eval = row1_template_eval
                    command_eval = row1_command_eval
                if row3_template_eval is None:
                    row3_template_eval = ''
                if row3_command_eval is None:
                    row3_command_eval = ''
                o_f.write('"{}","{}","{}",{},{},{},{},{},{},{},{}\n'.format(
                    current_desp.replace('"', '""'), pred_cmd.replace('"', '""'), pred_temp.replace('"', '""'),
                    template_eval, command_eval,
                    row1_template_eval, row1_command_eval,
                    row2_template_eval, row2_command_eval,
                    row3_template_eval, row3_command_eval))
    o_f.close()
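combine_annotations also reads its four file paths from the command line; an assumed invocation with hypothetical file names is sketched below.

# Assumed invocation (script and CSV file names are hypothetical):
#   python combine_annotations.py annotator_A.csv annotator_B.csv annotator_C.csv combined.csv
if __name__ == '__main__':
    combine_annotations()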
Code example #8
def print_error_analysis_sheet():
    input_file1 = sys.argv[1]
    input_file2 = sys.argv[2]
    input_file3 = sys.argv[3]
    output_file = sys.argv[4]
    o_f = open(output_file, 'w')
    o_f.write('description,model,prediction,correct template,correct command,'
              'correct template A,correct command A,'
              'correct template B,correct command B,'
              'correct template C,correct command C\n')
    sup_structure_eval, sup_command_eval = load_cached_evaluations_from_file(
        input_file3, treat_empty_as_correct=True)
    # for key in sup_structure_eval:
    #     print(key)
    # print('------------------')
    with open(input_file1) as f1:
        with open(input_file2) as f2:
            reader1 = csv.DictReader(f1)
            reader2 = csv.DictReader(f2)
            current_desp = ''
            for row_id, (row1, row2) in enumerate(zip(reader1, reader2)):
                if row1['description']:
                    current_desp = row1['description'].strip()
                model_name = row2['model']
                if not model_name in ['partial.token-copynet', 'tellina']:
                    continue
                if row_id % 3 != 0:
                    continue
                row1_template_eval = normalize_judgement(row1['correct template'].strip())
                row1_command_eval = normalize_judgement(row1['correct command'].strip())
                row2_template_eval = normalize_judgement(row2['correct template'].strip())
                row2_command_eval = normalize_judgement(row2['correct command'].strip())
                sc_key = get_example_nl_key(current_desp)
                pred_cmd = row1['prediction'].strip()
                if not pred_cmd:
                    row1_template_eval, row1_command_eval = 'n', 'n'
                    row2_template_eval, row2_command_eval = 'n', 'n'
                pred_temp = data_tools.cmd2template(pred_cmd, loose_constraints=True)
                structure_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_temp)
                command_example_key = '{}<NL_PREDICTION>{}'.format(sc_key, pred_cmd)
                row3_template_eval, row3_command_eval = None, None
                if structure_example_key in sup_structure_eval:
                    row3_template_eval = sup_structure_eval[structure_example_key]
                if command_example_key in sup_command_eval:
                    row3_command_eval = sup_command_eval[command_example_key]
                if row1_template_eval != row2_template_eval or row1_command_eval != row2_command_eval:
                    if row1_template_eval != row2_template_eval:
                        if row3_template_eval is None:
                            print(structure_example_key)
                        assert (row3_template_eval is not None)
                        template_eval = row3_template_eval
                    else:
                        template_eval = row1_template_eval
                    if row1_command_eval != row2_command_eval:
                        # if row3_command_eval is None:
                        #     print(command_example_key)
                        assert (row3_command_eval is not None)
                        command_eval = row3_command_eval
                    else:
                        command_eval = row1_command_eval
                else:
                    template_eval = row1_template_eval
                    command_eval = row1_command_eval
                if row3_template_eval is None:
                    row3_template_eval = ''
                if row3_command_eval is None:
                    row3_command_eval = ''
                o_f.write('"{}","{}","{}",{},{},{},{},{},{},{},{}\n'.format(
                    current_desp.replace('"', '""'), model_name, pred_cmd.replace('"', '""'),
                    template_eval, command_eval,
                    row1_template_eval, row1_command_eval,
                    row2_template_eval, row2_command_eval,
                    row3_template_eval, row3_command_eval))
    o_f.close()
Code example #9
File: error_analysis.py Project: syzer/nl2bash
def gen_manual_evaluation_csv_single_model(dataset, FLAGS):
    """
    Generate .csv spreadsheet for manual evaluation on dev/test set
    examples for a specific model.
    """
    # Group dataset
    tokenizer_selector = "cm" if FLAGS.explain else "nl"
    grouped_dataset = data_utils.group_parallel_data(
        dataset, use_bucket=True, tokenizer_selector=tokenizer_selector)

    # Load model predictions
    model_subdir, decode_sig = graph_utils.get_decode_signature(FLAGS)
    model_dir = os.path.join(FLAGS.model_root_dir, model_subdir)
    prediction_list = load_predictions(model_dir, decode_sig, top_k=3)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth list and prediction list length must "
                         "be equal: {} vs. {}".format(len(grouped_dataset),
                                                      len(prediction_list)))

    # Load additional ground truths
    template_translations, command_translations = load_cached_correct_translations(
        FLAGS.data_dir)

    # Load cached evaluation results
    structure_eval_cache, command_eval_cache = load_cached_evaluations(
        os.path.join(FLAGS.data_dir, 'manual_judgements'))

    eval_bash = FLAGS.dataset.startswith("bash")
    cmd_parser = data_tools.bash_parser if eval_bash else data_tools.paren_parser

    output_path = os.path.join(model_dir, 'manual.evaluations.single.model')
    with open(output_path, 'w') as o_f:
        # write spreadsheet header
        o_f.write('id,description,command,correct template,correct command\n')
        for example_id in range(len(grouped_dataset)):
            data_group = grouped_dataset[example_id][1]
            sc_txt = data_group[0].sc_txt.strip()
            sc_key = get_example_nl_key(sc_txt)
            command_gts = [dp.tg_txt for dp in data_group]
            command_gts = set(command_gts) | command_translations[sc_key]
            command_gt_asts = [
                data_tools.bash_parser(cmd) for cmd in command_gts
            ]
            template_gts = [
                data_tools.cmd2template(cmd, loose_constraints=True)
                for cmd in command_gts
            ]
            template_gts = set(template_gts) | template_translations[sc_key]
            template_gt_asts = [
                data_tools.bash_parser(temp) for temp in template_gts
            ]
            predictions = prediction_list[example_id]
            for i in range(3):
                if i >= len(predictions):
                    o_f.write(',,,n,n\n')
                    continue
                pred_cmd = predictions[i]
                pred_tree = cmd_parser(pred_cmd)
                pred_temp = data_tools.ast2template(pred_tree,
                                                    loose_constraints=True)
                temp_match = tree_dist.one_match(template_gt_asts,
                                                 pred_tree,
                                                 ignore_arg_value=True)
                str_match = tree_dist.one_match(command_gt_asts,
                                                pred_tree,
                                                ignore_arg_value=False)
                # Match ground truths & existing judgements
                command_example_sig = '{}<NL_PREDICTION>{}'.format(
                    sc_key, pred_cmd)
                structure_example_sig = '{}<NL_PREDICTION>{}'.format(
                    sc_key, pred_temp)
                command_eval, structure_eval = '', ''
                if str_match:
                    command_eval = 'y'
                    structure_eval = 'y'
                elif temp_match:
                    structure_eval = 'y'
                if command_eval_cache and \
                        command_example_sig in command_eval_cache:
                    command_eval = command_eval_cache[command_example_sig]
                if structure_eval_cache and \
                        structure_example_sig in structure_eval_cache:
                    structure_eval = structure_eval_cache[
                        structure_example_sig]
                if i == 0:
                    o_f.write('{},"{}","{}",{},{}\n'.format(
                        example_id, sc_txt.replace('"', '""'),
                        pred_cmd.replace('"', '""'), structure_eval,
                        command_eval))
                else:
                    o_f.write(',,"{}",{},{}\n'.format(
                        pred_cmd.replace('"', '""'), structure_eval,
                        command_eval))
    print('manual evaluation spreadsheet saved to {}'.format(output_path))