Example #1
import sys

# NOTE: the imports below are assumptions about the project layout; the code
# uses data_utils, data_tools and DBConnection, which must be importable from
# wherever this module lives.
import data_utils
import data_tools
from db_utils import DBConnection


def decode_set(model, dataset, rev_nl_vocab, rev_cm_vocab, verbose=True):
    grouped_dataset = data_utils.group_data_by_nl(dataset)

    with DBConnection() as db:
        # Identifier used to key predictions in the DB; derived from the model
        # object here (assumption, the attribute name may differ in practice).
        model_name = getattr(model, 'model_dir', type(model).__name__)
        db.remove_model(model_name)
        num_eval = 0
        for nl_temp in grouped_dataset:
            batch_nl_strs, batch_cm_strs, batch_nls, batch_cmds = \
                grouped_dataset[nl_temp]

            nl_str = batch_nl_strs[0]
            nl = batch_nls[0]
            if verbose:
                print("Example {}".format(num_eval+1))
                print("Original English: " + nl_str.strip())
                print("English: " + nl_temp)
                for j in range(len(batch_cm_strs)):
                    print("GT Command {}: {}".format(j+1, batch_cm_strs[j].strip()))
            top_k_results = model.test(nl, 10)
            for i in range(len(top_k_results)):
                nn, cmd, score = top_k_results[i]
                # Use distinct loop variables so the outer index i is not
                # clobbered by the comprehension and the token loop below.
                nn_str = ' '.join([rev_nl_vocab[nn_id] for nn_id in nn])
                tokens = []
                for cm_id in cmd:
                    pred_token = rev_cm_vocab[cm_id]
                    # Strip subword ("@@") markers from the predicted token.
                    if "@@" in pred_token:
                        pred_token = pred_token.split("@@")[-1]
                    tokens.append(pred_token)
                pred_cmd = ' '.join(tokens)
                tree = data_tools.bash_parser(pred_cmd)
                if verbose:
                    print("NN: {}".format(nn_str))
                    print("Prediction {}: {} ({})".format(i, pred_cmd, score))
                    print("AST: ")
                    data_tools.pretty_print(tree, 0)
                    print()
                db.add_prediction(model_name, nl_str, pred_cmd, float(score),
                                  update_mode=False)
            
            num_eval += 1
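
# Hypothetical usage sketch for decode_set (the names below are assumptions,
# not the project's actual driver code): the model is expected to expose
# .test(nl_ids, k) returning (neighbour_ids, command_ids, score) triples, and
# rev_nl_vocab / rev_cm_vocab to map token ids back to token strings.
#
#     rev_nl_vocab = dict(enumerate(nl_vocab_list))
#     rev_cm_vocab = dict(enumerate(cm_vocab_list))
#     decode_set(model, dev_set, rev_nl_vocab, rev_cm_vocab, verbose=True)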
    print("train entropy = {}".format(entropy(train_set)))
    dev_by_nl = group_data_by_nl(dev_set, use_nl_temp=FLAGS.dataset.startswith("bash"))
    print("dev cmd/nl ratio = {}".format(ratio(dev_by_nl, 1)))
    print("dev %nl(cmd+) = {}".format(pp(dev_by_nl)))
    print("dev nl overlap = {}".format(overlap(train_by_nl, dev_by_nl)))
    print("dev entropy = {}".format(entropy(dev_set)))
    test_by_nl = group_data_by_nl(test_set, use_nl_temp=FLAGS.dataset.startswith("bash"))
    print("test cmd/nl ratio = {}".format(ratio(test_by_nl, 1)))
    print("test %nl(cmd+) = {}".format(pp(test_by_nl)))
    print("test nl overlap = {}".format(overlap(train_by_nl, test_by_nl)))
    print("test entropy = {}".format(entropy(test_set)))
    print("total entropy = {}".format(entropy(train_set + dev_set + test_set)))
    train_by_cm = group_data_by_cm(train_set, use_cm_temp=FLAGS.dataset.startswith("bash"))
    print(len(train_by_cm))
    print("train nl/cmd ratio = {}".format(ratio(train_by_cm, 0)))
    print("train %cmd(nl+) = {}".format(pp(train_by_cm)))
    dev_by_cm = group_data_by_cm(dev_set, use_cm_temp=FLAGS.dataset.startswith("bash"))
    print("dev nl/cmd ratio = {}".format(ratio(dev_by_cm, 0)))
    print("dev %cmd(nl+) = {}".format(pp(dev_by_cm)))
    print("dev cm overlap = {}".format(overlap(train_by_cm, dev_by_cm)))
    test_by_cm = group_data_by_cm(test_set, use_cm_temp=FLAGS.dataset.startswith("bash"))
    print("test nl/cmd ratio = {}".format(ratio(test_by_cm, 0)))
    print("test %cmd(nl+) = {}".format(pp(test_by_cm)))
    print("test cm overlap = {}".format(overlap(train_by_cm, test_by_cm)))

if __name__ == "__main__":
    # Parse the command string given on the command line, pretty-print its
    # AST, and print the corresponding argument-typed template.
    tree = data_tools.paren_parser(sys.argv[1])
    data_tools.pretty_print(tree, 0)
    print(data_tools.ast2template(
        tree, loose_constraints=True, arg_type_only=True))
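
# Example invocation (hypothetical; the exact input format expected by
# data_tools.paren_parser is defined elsewhere in the project):
#
#     python <this_script>.py "<parenthesised command string>"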