Example 1
def automatic_eval(model_dir,
                   decode_sig,
                   dataset,
                   FLAGS,
                   top_k,
                   num_samples=-1,
                   verbose=False):
    """
    Generate automatic evaluation metrics on a dev/test set.
    The following metrics are computed:
        Top 1,3,5,10
            1. Structure accuracy
            2. Full command accuracy
            3. Command keyword overlap
            4. BLEU
    """
    use_bucket = "knn" not in model_dir

    grouped_dataset = data_utils.group_parallel_data(dataset,
                                                     use_bucket=use_bucket)
    vocabs = data_utils.load_vocabulary(FLAGS)

    # Load predictions
    prediction_list = load_predictions(model_dir, decode_sig, top_k)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth and predictions length must be equal: "
                         "{} vs. {}".format(len(grouped_dataset),
                                            len(prediction_list)))

    M = get_automatic_evaluation_metrics(grouped_dataset, prediction_list,
                                         vocabs, FLAGS, top_k, num_samples,
                                         verbose)
    return M
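
The docstring above lists top-k structure and full-command accuracy among the computed metrics. As a rough, self-contained illustration of the top-k accuracy idea only (the function name and data layout below are assumptions, not the project's get_automatic_evaluation_metrics):

def top_k_accuracy(grouped_dataset, prediction_list, k):
    """Fraction of examples whose top-k predictions contain a reference.

    grouped_dataset: list of (source, [reference_command, ...]) pairs.
    prediction_list: list of ranked prediction lists, aligned by index.
    """
    assert len(grouped_dataset) == len(prediction_list)
    hits = 0
    for (_, references), predictions in zip(grouped_dataset, prediction_list):
        # A hit if any of the first k predictions matches a reference exactly.
        if any(pred in references for pred in predictions[:k]):
            hits += 1
    return hits / len(grouped_dataset)

# Hypothetical data: the second example only matches at k=2.
data = [('list files', ['ls']), ('disk usage', ['du -h'])]
preds = [['ls', 'ls -l'], ['df -h', 'du -h']]
print(top_k_accuracy(data, preds, k=1))  # 0.5
print(top_k_accuracy(data, preds, k=2))  # 1.0
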
Example 2
def automatic_eval(prediction_path,
                   dataset,
                   FLAGS,
                   top_k,
                   num_samples=-1,
                   verbose=False):
    """
    Generate automatic evaluation metrics on dev/test set.
    The following metrics are computed:
        Top 1,3,5,10
            1. Structure accuracy
            2. Full command accuracy
            3. Command keyword overlap
            4. BLEU
    """
    grouped_dataset = data_utils.group_parallel_data(dataset)
    try:
        vocabs = data_utils.load_vocabulary(FLAGS)
    except ValueError:
        vocabs = None

    # Load predictions
    prediction_list = load_predictions(prediction_path, top_k)
    if len(grouped_dataset) != len(prediction_list):
        raise ValueError("ground truth and predictions length must be equal: "
                         "{} vs. {}".format(len(grouped_dataset),
                                            len(prediction_list)))

    metrics = get_automatic_evaluation_metrics(grouped_dataset,
                                               prediction_list, vocabs, FLAGS,
                                               top_k, num_samples, verbose)
    return metrics
Example 3
def demo(sess, model, FLAGS):
    """
    Simple command line decoding interface.
    """
    # Decode from standard input.
    sys.stdout.write('> ')
    sys.stdout.flush()
    sentence = sys.stdin.readline()

    vocabs = data_utils.load_vocabulary(FLAGS)

    while sentence:
        if FLAGS.fill_argument_slots:
            slot_filling_classifier = get_slot_filling_classifer(FLAGS)
            batch_outputs, sequence_logits = translate_fun(
                sentence,
                sess,
                model,
                vocabs,
                FLAGS,
                slot_filling_classifier=slot_filling_classifier)
        else:
            batch_outputs, sequence_logits = translate_fun(
                sentence, sess, model, vocabs, FLAGS)
        if FLAGS.token_decoding_algorithm == 'greedy':
            tree, pred_cmd, outputs = batch_outputs[0]
            score = sequence_logits[0]
            print('{} ({})'.format(pred_cmd, score))
        elif FLAGS.token_decoding_algorithm == 'beam_search':
            if batch_outputs:
                top_k_predictions = batch_outputs[0]
                top_k_scores = sequence_logits[0]
                for j in xrange(min(FLAGS.beam_size, 10,
                                    len(batch_outputs[0]))):
                    if len(top_k_predictions) <= j:
                        break
                    top_k_pred_tree, top_k_pred_cmd = top_k_predictions[j]
                    print('Prediction {}: {} ({}) '.format(
                        j + 1, top_k_pred_cmd, top_k_scores[j]))
                print()
            else:
                print(APOLOGY_MSG)
        print('> ', end='')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
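
The prompt/readline loop above is the entire command-line interface. A stripped-down, self-contained version of the same pattern, with a placeholder standing in for translate_fun and the TensorFlow session, model, and vocabulary setup, might look like this:

import sys

def fake_translate(sentence):
    # Placeholder for translate_fun(sentence, sess, model, vocabs, FLAGS).
    return [('echo ' + sentence.strip(), 0.0)]

def repl():
    sys.stdout.write('> ')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
        for rank, (command, score) in enumerate(fake_translate(sentence), 1):
            print('Prediction {}: {} ({})'.format(rank, command, score))
        print('> ', end='')
        sys.stdout.flush()
        sentence = sys.stdin.readline()

if __name__ == '__main__':
    repl()
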
Example 4
def gen_automatic_evaluation_table(dataset, FLAGS):
    # Group dataset
    grouped_dataset = data_utils.group_parallel_data(dataset, use_bucket=True)
    vocabs = data_utils.load_vocabulary(FLAGS)

    model_names, model_predictions = load_all_model_predictions(
        grouped_dataset, FLAGS, top_k=3)

    auto_evaluation_metrics = {}
    for model_id, model_name in enumerate(model_names):
        prediction_list = model_predictions[model_id]
        M = get_automatic_evaluation_metrics(
            grouped_dataset, prediction_list, vocabs, FLAGS, top_k=3)
        auto_evaluation_metrics[model_name] = \
            [M['top_bleu'][0], M['top_bleu'][1], M['top_cms'][0], M['top_cms'][1]]

    metrics_names = ['BLEU1', 'BLEU3', 'TM1', 'TM3']
    print_table(model_names, metrics_names, auto_evaluation_metrics)
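
print_table is defined elsewhere in the project and is not shown here. Purely as an illustration of the {model_name: [metric values]} layout it receives, a minimal table printer could be sketched as follows (names and sample values are hypothetical):

def print_metrics_table(model_names, metrics_names, metrics):
    """Print one row per model; metrics maps model name -> list of values."""
    header = '{:<20}'.format('model') + ''.join(
        '{:>8}'.format(m) for m in metrics_names)
    print(header)
    print('-' * len(header))
    for name in model_names:
        row = metrics.get(name)
        if row is None:
            continue  # model was skipped in evaluation
        print('{:<20}'.format(name) + ''.join(
            '{:>8.3f}'.format(v) for v in row))

# Hypothetical numbers, for illustration only.
print_metrics_table(
    ['seq2seq', 'copynet'], ['BLEU1', 'BLEU3', 'TM1', 'TM3'],
    {'seq2seq': [0.51, 0.58, 0.43, 0.49],
     'copynet': [0.55, 0.61, 0.47, 0.53]})
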
Example 5
def gen_automatic_evaluation_table(dataset, FLAGS):
    # Group dataset
    grouped_dataset = data_utils.group_parallel_data(dataset)
    vocabs = data_utils.load_vocabulary(FLAGS)

    model_names, model_predictions = load_all_model_predictions(
        grouped_dataset, FLAGS, top_k=3)
    auto_eval_metrics = {}
    for model_id, model_name in enumerate(model_names):
        prediction_list = model_predictions[model_id]
        if prediction_list is not None:
            M = get_automatic_evaluation_metrics(grouped_dataset,
                                                 prediction_list,
                                                 vocabs,
                                                 FLAGS,
                                                 top_k=3)
            auto_eval_metrics[model_name] = [
                M['bleu'][0], M['bleu'][1], M['cms'][0], M['cms'][1]
            ]
        else:
            print('Model {} skipped in evaluation'.format(model_name))
    metrics_names = ['BLEU1', 'BLEU3', 'TM1', 'TM3']
    print_eval_table(model_names, metrics_names, auto_eval_metrics)
Example 6
FLAGS.sc_vocab_size = 1324
FLAGS.tg_vocab_size = 1219
FLAGS.max_sc_token_size = 100
FLAGS.max_tg_token_size = 100
buckets = [(13, 57), (18, 57), (42, 57)]

# Create tensorflow session
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=FLAGS.log_device_placement))

# Create model and load neural model parameters.
model = translate.define_model(sess, forward_only=True, buckets=buckets)
print('loading models from {}'.format(FLAGS.model_dir))

vocabs = data_utils.load_vocabulary(FLAGS)

if FLAGS.fill_argument_slots:
    # Create slot filling classifier
    model_param_dir = os.path.join(FLAGS.model_dir, 'train.mappings.X.Y.npz')
    train_X, train_Y = data_utils.load_slot_filling_data(model_param_dir)
    slot_filling_classifier = classifiers.KNearestNeighborModel(
        FLAGS.num_nn_slot_filling, train_X, train_Y)
    print('Slot filling classifier parameters loaded.')
else:
    slot_filling_classifier = None


def translate_fun(sentence, slot_filling_classifier=slot_filling_classifier):
    print('translating |{}|'.format(sentence))
    list_of_translations = decode_tools.translate_fun(
        sentence, sess, model, vocabs, FLAGS,
        slot_filling_classifier=slot_filling_classifier)
    return list_of_translations
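
classifiers.KNearestNeighborModel is constructed from the stored (train_X, train_Y) slot-filling mappings; its implementation is not part of this snippet. As a loose illustration of the nearest-neighbor idea only (an assumption about the general approach, not the project's class), a tiny numpy version could be:

import numpy as np

class TinyNearestNeighbor(object):
    """Predict the stored target of the closest training vector (k = 1)."""

    def __init__(self, train_X, train_Y):
        self.train_X = np.asarray(train_X, dtype=float)
        self.train_Y = np.asarray(train_Y)

    def predict(self, x):
        # Euclidean distance to every stored vector; return the closest target.
        distances = np.linalg.norm(
            self.train_X - np.asarray(x, dtype=float), axis=1)
        return self.train_Y[np.argmin(distances)]

# Hypothetical feature vectors and slot labels.
knn = TinyNearestNeighbor([[0.0, 0.0], [1.0, 1.0]], ['dir_arg', 'file_arg'])
print(knn.predict([0.9, 1.1]))  # file_arg
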
Example 7
def grid_search(train_fun, decode_fun, eval_fun, train_set, dev_set, FLAGS):
    '''
    Perform hyperparameter tuning of a model using grid-search.

    Usage: ./run-script.sh --grid_search --tuning hp1,...

    :param train_fun: Function to train the model.
    :param decode_fun: Function to decode from the trained model.
    :param eval_fun: Function to evaluate the decoding results.
    :param train_set: Training dataset.
    :param dev_set: Development dataset.
    :param FLAGS: General model hyperparameters.
    '''
    FLAGS.create_fresh_params = True

    hyperparameters = FLAGS.tuning.split(',')
    num_hps = len(hyperparameters)
    hp_range = hyperparam_range

    print('======== Grid Search ========')
    print('%d hyperparameter(s): ' % num_hps)
    for i in xrange(num_hps):
        print('{}: {}'.format(hyperparameters[i],
                              hp_range[hyperparameters[i]]))
    print()

    if FLAGS.dataset.startswith('bash'):
        metrics = [
            'top1_temp_ms', 'top1_cms', 'top3_temp_ms', 'top3_cms',
            'top1_str_ms', 'top3_str_ms'
        ]
        metrics_weights = [0.1875, 0.1875, 0.0625, 0.0625, 0.25, 0.25]
    else:
        metrics = ['top1_temp_ms']
        metrics_weights = [1]
    metrics_signature = '+'.join(
        ['{}x{}'.format(m, mw) for m, mw in zip(metrics, metrics_weights)])

    # Grid search experiment log
    grid_search_log_file_name = 'grid_search_log.{}'.format(FLAGS.channel)
    if FLAGS.use_copy:
        grid_search_log_file_name += '.{}'.format(FLAGS.copy_fun)
    if FLAGS.normalized:
        grid_search_log_file_name += '.normalized'
    grid_search_log_file = open(
        os.path.join(FLAGS.model_root_dir, grid_search_log_file_name), 'w')

    # Generate grid
    param_grid = [v for v in hp_range[hyperparameters[0]]]
    for i in xrange(1, num_hps):
        param_grid = itertools.product(param_grid,
                                       hp_range[hyperparameters[i]])

    # Initialize metrics value
    best_hp_set = [-1] * num_hps
    best_seed = -1
    best_metrics_value = 0

    for row in param_grid:
        row = nest.flatten(row)

        # Set current hyperparameter set
        for i in xrange(num_hps):
            setattr(FLAGS, hyperparameters[i], row[i])
            if hyperparameters[i] == 'universal_keep':
                setattr(FLAGS, 'sc_input_keep', row[i])
                setattr(FLAGS, 'sc_output_keep', row[i])
                setattr(FLAGS, 'tg_input_keep', row[i])
                setattr(FLAGS, 'tg_output_keep', row[i])
                setattr(FLAGS, 'attention_input_keep', row[i])
                setattr(FLAGS, 'attention_output_keep', row[i])

        print('Trying parameter set: ')
        for i in xrange(num_hps):
            print('* {}: {}'.format(hyperparameters[i], row[i]))

        # Try different random seed if tuning initialization
        num_trials = 5 if FLAGS.initialization else 1

        if 'min_vocab_frequency' in hyperparameters or \
                'num_buckets' in hyperparameters:
            # Read train and dev sets from disk
            train_set, dev_set, test_set = \
                data_utils.load_data(FLAGS, use_buckets=True, load_mappings=False)
            vocab = data_utils.load_vocabulary(FLAGS)
            FLAGS.sc_vocab_size = len(vocab.sc_vocab)
            FLAGS.tg_vocab_size = len(vocab.tg_vocab)
            FLAGS.max_sc_token_size = vocab.max_sc_token_size
            FLAGS.max_tg_token_size = vocab.max_tg_token_size

        for t in xrange(num_trials):
            seed = random.getrandbits(32)
            tf.set_random_seed(seed)
            metrics_value = single_round_model_eval(train_fun, decode_fun,
                                                    eval_fun, train_set,
                                                    dev_set, metrics,
                                                    metrics_weights)
            print('Parameter set: ')
            for i in xrange(num_hps):
                print('* {}: {}'.format(hyperparameters[i], row[i]))
            print('random seed: {}'.format(seed))
            print('{} = {}'.format(metrics_signature, metrics_value))
            grid_search_log_file.write('Parameter set: \n')
            for i in xrange(num_hps):
                grid_search_log_file.write('* {}: {}\n'.format(
                    hyperparameters[i], row[i]))
            grid_search_log_file.write('random seed: {}\n'.format(seed))
            grid_search_log_file.write('{} = {}\n\n'.format(
                metrics_signature, metrics_value))
            print('Best parameter set so far: ')
            for i in xrange(num_hps):
                print('* {}: {}'.format(hyperparameters[i], best_hp_set[i]))
            print('Best random seed so far: {}'.format(best_seed))
            print('Best evaluation metrics so far = {}'.format(
                best_metrics_value))
            if metrics_value > best_metrics_value:
                best_hp_set = row
                best_seed = seed
                best_metrics_value = metrics_value
                print('☺ New best parameter setting found\n')

    print()
    print('*****************************')
    print('Best parameter set: ')
    for i in xrange(num_hps):
        print('* {}: {}'.format(hyperparameters[i], best_hp_set[i]))
    print('Best seed = {}'.format(best_seed))
    print('Best {} = {}'.format(metrics, best_metrics_value))
    print('*****************************')
    grid_search_log_file.write('*****************************\n')
    grid_search_log_file.write('Best parameter set: \n')
    for i in xrange(num_hps):
        grid_search_log_file.write('* {}: {}\n'.format(hyperparameters[i],
                                                       best_hp_set[i]))
    grid_search_log_file.write('Best seed = {}\n'.format(best_seed))
    grid_search_log_file.write('Best {} = {}\n'.format(metrics,
                                                       best_metrics_value))
    grid_search_log_file.write('*****************************')
    grid_search_log_file.close()
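
The nested itertools.product calls above build the parameter grid incrementally, which is why each row is re-flattened with nest.flatten. A self-contained sketch of the same grid-search loop over a dictionary of hyperparameter ranges, with a dummy objective standing in for single_round_model_eval, is shown below (all names and values here are hypothetical):

import itertools

def simple_grid_search(hp_range, evaluate):
    """Try every combination in hp_range and return the best-scoring one."""
    names = sorted(hp_range)
    best_score, best_params = float('-inf'), None
    # itertools.product over all ranges at once yields flat tuples directly,
    # so no flattening step is needed.
    for values in itertools.product(*(hp_range[name] for name in names)):
        params = dict(zip(names, values))
        score = evaluate(params)
        if score > best_score:
            best_score, best_params = score, params
    return best_params, best_score

# Dummy objective: prefers keep probability 1.0 and a 400-dimensional decoder.
hp_range = {'sc_input_keep': [0.6, 0.8, 1.0], 'decoder_dim': [200, 400]}
best, score = simple_grid_search(
    hp_range,
    lambda p: p['sc_input_keep'] - abs(p['decoder_dim'] - 400) / 1000.0)
print(best, score)
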
Example 8
def decode_set(sess, model, dataset, top_k, FLAGS, verbose=False):
    """
    Compute top-k predictions on the dev/test dataset and write the predictions
    to disk.

    :param sess: A TensorFlow session.
    :param model: Prediction model object.
    :param top_k: Number of top predictions to compute.
    :param FLAGS: Training/testing hyperparameter settings.
    :param verbose: If set, also print decoding results to screen.
    """
    nl2bash = FLAGS.dataset.startswith('bash') and not FLAGS.explain

    tokenizer_selector = 'cm' if FLAGS.explain else 'nl'
    grouped_dataset = data_utils.group_parallel_data(
        dataset, tokenizer_selector=tokenizer_selector)
    vocabs = data_utils.load_vocabulary(FLAGS)
    rev_sc_vocab = vocabs.rev_sc_vocab

    ts = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H%M%S')
    pred_file_path = os.path.join(model.model_dir, 'predictions.{}.{}'.format(
        model.decode_sig, ts))
    pred_file = open(pred_file_path, 'w')
    eval_file_path = os.path.join(model.model_dir, 'predictions.{}.{}.csv'.format(
        model.decode_sig, ts))
    eval_file = open(eval_file_path, 'w')
    eval_file.write('example_id, description, ground_truth, prediction, ' +
                    'correct template, correct command\n')
    for example_id in xrange(len(grouped_dataset)):
        key, data_group = grouped_dataset[example_id]

        sc_txt = data_group[0].sc_txt.strip()
        sc_tokens = [rev_sc_vocab[i] for i in data_group[0].sc_ids]
        if FLAGS.channel == 'char':
            sc_temp = ''.join(sc_tokens)
            sc_temp = sc_temp.replace(constants._SPACE, ' ')
        else:
            sc_temp = ' '.join(sc_tokens)
        tg_txts = [dp.tg_txt for dp in data_group]
        tg_asts = [data_tools.bash_parser(tg_txt) for tg_txt in tg_txts]
        if verbose:
            print('\nExample {}:'.format(example_id))
            print('Original Source: {}'.format(sc_txt.encode('utf-8')))
            print('Source: {}'.format(sc_temp.encode('utf-8')))
            for j in xrange(len(data_group)):
                print('GT Target {}: {}'.format(j+1, data_group[j].tg_txt.encode('utf-8')))

        if FLAGS.fill_argument_slots:
            slot_filling_classifier = get_slot_filling_classifer(FLAGS)
            batch_outputs, sequence_logits = translate_fun(data_group, sess, model,
                vocabs, FLAGS, slot_filling_classifier=slot_filling_classifier)
        else:
            batch_outputs, sequence_logits = translate_fun(data_group, sess, model,
                vocabs, FLAGS)
        if FLAGS.tg_char:
            batch_outputs, batch_char_outputs = batch_outputs

        eval_row = '{},"{}",'.format(example_id, sc_txt.replace('"', '""'))
        if batch_outputs:
            if FLAGS.token_decoding_algorithm == 'greedy':
                tree, pred_cmd = batch_outputs[0]
                if nl2bash:
                    pred_cmd = data_tools.ast2command(
                        tree, loose_constraints=True)
                score = sequence_logits[0]
                if verbose:
                    print('Prediction: {} ({})'.format(pred_cmd, score))
                pred_file.write('{}\n'.format(pred_cmd))
            elif FLAGS.token_decoding_algorithm == 'beam_search':
                top_k_predictions = batch_outputs[0]
                if FLAGS.tg_char:
                    top_k_char_predictions = batch_char_outputs[0]
                top_k_scores = sequence_logits[0]
                num_preds = min(FLAGS.beam_size, top_k, len(top_k_predictions))
                for j in xrange(num_preds):
                    if j > 0:
                        eval_row = ',,'
                    if j < len(tg_txts):
                        eval_row += '"{}",'.format(tg_txts[j].strip().replace('"', '""'))
                    else:
                        eval_row += ','
                    top_k_pred_tree, top_k_pred_cmd = top_k_predictions[j]
                    if nl2bash:
                        pred_cmd = data_tools.ast2command(
                            top_k_pred_tree, loose_constraints=True)
                    else:
                        pred_cmd = top_k_pred_cmd
                    pred_file.write('{}|||'.format(pred_cmd.encode('utf-8')))
                    eval_row += '"{}",'.format(pred_cmd.replace('"', '""'))
                    temp_match = tree_dist.one_match(
                        tg_asts, top_k_pred_tree, ignore_arg_value=True)
                    str_match = tree_dist.one_match(
                        tg_asts, top_k_pred_tree, ignore_arg_value=False)
                    if temp_match:
                        eval_row += 'y,'
                    if str_match:
                        eval_row += 'y'
                    eval_file.write('{}\n'.format(eval_row.encode('utf-8')))
                    if verbose:
                        print('Prediction {}: {} ({})'.format(
                            j+1, pred_cmd.encode('utf-8'), top_k_scores[j]))
                        if FLAGS.tg_char:
                            print('Character-based prediction {}: {}'.format(
                                j+1, top_k_char_predictions[j].encode('utf-8')))
                pred_file.write('\n')
        else:
            print(APOLOGY_MSG)
            pred_file.write('\n')
            eval_file.write('{}\n'.format(eval_row))
            eval_file.write('\n')
            eval_file.write('\n')
    pred_file.close()
    eval_file.close()
    shutil.copyfile(pred_file_path, os.path.join(FLAGS.model_dir,
        'predictions.{}.latest'.format(model.decode_sig)))
    shutil.copyfile(eval_file_path, os.path.join(FLAGS.model_dir,
        'predictions.{}.latest.csv'.format(model.decode_sig)))
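
The evaluation rows above escape embedded double quotes by hand with replace('"', '""'). An equivalent and less error-prone way to produce the same CSV layout is the standard-library csv module; the sketch below (file name and sample row are hypothetical) follows the header written by decode_set:

import csv

def write_eval_rows(path, rows):
    """rows: tuples of (example_id, description, ground_truth, prediction,
    correct_template, correct_command)."""
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['example_id', 'description', 'ground_truth',
                         'prediction', 'correct template', 'correct command'])
        writer.writerows(rows)  # quoting and escaping handled by csv

write_eval_rows('predictions.example.csv',
                [(0, 'list "all" files', 'ls -a', 'ls -a', 'y', 'y')])
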
Example 9
def main(_):
    # set GPU device
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 
    # set up data and model directories
    FLAGS.data_dir = os.path.join(
        os.path.dirname(__file__), "..", "data", FLAGS.dataset)
    print("Reading data from {}".format(FLAGS.data_dir))

    # set up encoder/decoder dropout rates
    if FLAGS.universal_keep >= 0 and FLAGS.universal_keep < 1:
        FLAGS.sc_input_keep = FLAGS.universal_keep
        FLAGS.sc_output_keep = FLAGS.universal_keep
        FLAGS.tg_input_keep = FLAGS.universal_keep
        FLAGS.tg_output_keep = FLAGS.universal_keep
        FLAGS.attention_input_keep = FLAGS.universal_keep
        FLAGS.attention_output_keep = FLAGS.universal_keep

    # adjust hyperparameters for batch normalization
    if FLAGS.recurrent_batch_normalization:
        # larger batch size
        FLAGS.batch_size *= 4
        # larger initial learning rate
        FLAGS.learning_rate *= 10

    if FLAGS.decoder_topology in ['basic_tree']:
        FLAGS.model_root_dir = os.path.join(
            os.path.dirname(__file__), "..", FLAGS.model_root_dir, "seq2tree")
    elif FLAGS.decoder_topology in ['rnn']:
        FLAGS.model_root_dir = os.path.join(
            os.path.dirname(__file__), "..", FLAGS.model_root_dir, "seq2seq")
    else:
        raise ValueError("Unrecognized decoder topology: {}."
                         .format(FLAGS.decoder_topology))
    print("Saving models to {}".format(FLAGS.model_root_dir))

    if FLAGS.process_data:
        process_data()

    else:
        train_set, dev_set, test_set = \
            data_utils.load_data(FLAGS, use_buckets=True, load_mappings=False)
        vocab = data_utils.load_vocabulary(FLAGS)

        print("Set dataset parameters")
        FLAGS.max_sc_length = train_set.max_sc_length if not train_set.buckets else \
            train_set.buckets[-1][0]
        FLAGS.max_tg_length = train_set.max_tg_length if not train_set.buckets else \
            train_set.buckets[-1][1]
        FLAGS.sc_vocab_size = len(vocab.sc_vocab)
        FLAGS.tg_vocab_size = len(vocab.tg_vocab)
        FLAGS.max_sc_token_size = vocab.max_sc_token_size
        FLAGS.max_tg_token_size = vocab.max_tg_token_size

        dataset = test_set if FLAGS.test else dev_set
        if FLAGS.eval:
            eval(dataset)
            save_hyperparameters()
        elif FLAGS.gen_error_analysis_sheet:
            gen_error_analysis_sheets(dataset, group_by_utility=True)
        elif FLAGS.gen_manual_evaluation_sheet:
            error_analysis.gen_manual_evaluation_csv(dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_sheet_single_model:
            error_analysis.gen_manual_evaluation_csv_single_model(dataset, FLAGS)
        elif FLAGS.gen_manual_evaluation_table:
            if FLAGS.test:
                eval_tools.gen_evaluation_table(dataset, FLAGS)
            else:
                eval_tools.gen_evaluation_table(dataset, FLAGS, num_examples=100)
        elif FLAGS.gen_auto_evaluation_table:
            eval_tools.gen_automatic_evaluation_table(dataset, FLAGS)
        elif FLAGS.tabulate_example_predictions:
            error_analysis.tabulate_example_predictions(dataset, FLAGS, num_examples=100)

        elif FLAGS.gen_slot_filling_training_data:
            gen_slot_filling_training_data(FLAGS, [train_set, dev_set, test_set])

        elif FLAGS.decode:
            model = decode(dataset, buckets=train_set.buckets)
            if not FLAGS.explain:
                eval(dataset, model.model_dir, model.decode_sig, verbose=False)

        elif FLAGS.demo:
            demo(buckets=train_set.buckets)

        elif FLAGS.grid_search:
            meta_experiments.grid_search(
                train, decode, eval, train_set, dataset, FLAGS)
        elif FLAGS.schedule_experiments:
            schedule_experiments(
                train, decode, eval, train_set, dataset)
        else:
            # Train the model.
            train(train_set, dataset)

            if FLAGS.normalized:
                tf.reset_default_graph()
                gen_slot_filling_training_data(FLAGS, [train_set, dev_set, test_set])
                FLAGS.fill_argument_slots = True

            # save model hyperparameters
            save_hyperparameters() 

            # Decode the new model on the development set.
            tf.reset_default_graph()
            model = decode(dataset, buckets=train_set.buckets)

            # Run automatic evaluation on the development set.
            if not FLAGS.explain:
                eval(dataset, model.model_dir, model.decode_sig, verbose=False)