Example #1
def setup_graph(
    model_config: ModelConfig,
    output_vocab_filepath: str,
    clean_output_vocab_filepath: Optional[str],
    beam_size: int,
):
    """Sets up the Tenorflow graph for inference."""
    # Set up the model for inference
    # model_config = load_config(os.path.join(config_filepath))
    placeholder, features, labels = input_pipeline.create_placeholder_inputs(
        model_config.model_parameters.use_segment_ids,
        model_config.model_parameters.use_foreign_key_features,
        model_config.model_parameters.use_alignment_features,
    )

    model_fn = model_builder.build_model_fn(
        model_config,
        output_vocab_filepath,
        clean_output_vocab_filepath,
        beam_size=beam_size,
    )
    mode = tf.estimator.ModeKeys.PREDICT
    predictions = model_fn(features, labels, mode).predictions

    saver = tf.train.Saver()

    return saver, placeholder, predictions
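
A minimal sketch of how the returned objects might be wired together for inference. It assumes the placeholder accepts a single serialized tf.Example string; the checkpoint path, vocab path, and serialized_example below are illustrative placeholders, not values from the original code.

# Hypothetical usage of setup_graph (paths and inputs are illustrative only).
saver, placeholder, predictions = setup_graph(
    model_config=config,
    output_vocab_filepath="output_vocab.txt",
    clean_output_vocab_filepath=None,
    beam_size=1,
)

with tf.Session() as sess:
    saver.restore(sess, "/path/to/model.ckpt")
    # Assumes the placeholder expects one serialized tf.Example string.
    outputs = sess.run(predictions, feed_dict={placeholder: serialized_example})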
Example #2
def main(unused_argv: Any) -> None:
    tf.logging.info("Saving model saves and results to " + FLAGS.model_dir)

    global_seed(42)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    config = model_config.load_config(FLAGS.config)

    if FLAGS.do_train:
        tf.logging.info("Training with train filenames: " +
                        str(FLAGS.training_filename))

    # Training allows noisy examples so do not use clean output vocab
    model_fn = model_builder.build_model_fn(config,
                                            FLAGS.output_vocab_filepath,
                                            clean_output_vocab_path="")

    # region training
    if FLAGS.do_train:
        # for keepsake CLI (helps track experiment results)
        experiment = keepsake.init(params={
            "learning_rate": config.training_options.optimizer_learning_rate,
            "batch_size": config.training_options.batch_size,
            "training_steps": config.training_options.training_steps,
            "eval_batch_size": FLAGS.eval_batch_size,
            "training_data": FLAGS.training_filename,
            "eval_data": FLAGS.eval_filename,
        }, )

        train_input_fn = input_pipeline.create_training_input_fn(
            config,
            FLAGS.tf_examples_dir,
            [name for name in FLAGS.training_filename if name],
        )

        train_features, train_labels = train_input_fn()
        train_model = model_fn(train_features, train_labels,
                               tf.estimator.ModeKeys.TRAIN)

        tf.get_variable_scope().reuse_variables()

        inference_config = inference.Config(
            FLAGS.eval_dataset_name,
            FLAGS.eval_splits.split(","),
            FLAGS.output_vocab_filepath,
            FLAGS.clean_output_vocab_filepath,
            FLAGS.eval_beam_size,
            FLAGS.using_abstract_sql,
            FLAGS.database_directory,
            FLAGS.empty_database_directory,
            FLAGS.original_data_directory,
            model_config.load_config(FLAGS.config),
        )

        saver = tf.train.Saver(max_to_keep=None)

        global_step = 0
        checkpoint = checkpoint_path(FLAGS.model_dir, global_step)

        validation_query_cache: Dict[str, Any] = {}

        with tf.Session() as init_sess:
            init_sess.run(tf.global_variables_initializer())
            saver.save(init_sess, checkpoint)

        while global_step < config.training_options.training_steps:
            # region training loop
            with tf.Session() as train_sess:
                tf.logging.info(
                    "Training from step %s to step %s",
                    global_step,
                    global_step + FLAGS.steps_between_saves,
                )
                saver.restore(train_sess, checkpoint)

                train_losses = []

                for step in range(FLAGS.steps_between_saves):
                    _, train_loss = train_sess.run(
                        [train_model.train_op, train_model.loss])

                    train_losses.append(train_loss)

                    if step % 100 == 0:
                        tf.logging.info(
                            "Step %s's training loss: %s",
                            global_step + step,
                            train_loss,
                        )

                train_loss = statistics.mean(train_losses)

                global_step += FLAGS.steps_between_saves
                checkpoint = checkpoint_path(FLAGS.model_dir, global_step)
                saver.save(train_sess, checkpoint)
            # endregion

            # region eval loop
            tf.logging.info("Evaluating checkpoint %s", checkpoint)

            examples = inference.load_tf_examples(
                os.path.join(FLAGS.tf_examples_dir, FLAGS.eval_filename))
            random.shuffle(examples)

            tf.logging.info("Running inference on %s", FLAGS.eval_filename)
            predictions = inference.inference(
                examples,
                checkpoint,
                inference_config,
            )

            examples_to_execute = get_examples_to_execute(
                predictions, inference_config)

            # Only update cache when it's empty
            should_update_cache = len(validation_query_cache) == 0

            # Scholar is the only dataset executed case-insensitively.
            case_sensitive = "scholar" not in FLAGS.eval_dataset_name.lower()

            results, validation_query_cache = official_evaluation.execute_predictions(
                instructions=examples_to_execute,
                cache_dict=validation_query_cache,
                case_sensitive=case_sensitive,
                verbose=False,
                update_cache=should_update_cache,
            )

            metrics = official_evaluation.aggregate_metrics(
                results, FLAGS.use_empty_tables)
            tf.logging.info("Validation Results:\n\tExecution F1: %s",
                            metrics.execution_f1)
            # endregion

            experiment.checkpoint(
                step=global_step,
                metrics={
                    "train_loss": train_loss,
                    "eval_execution_f1": metrics.execution_f1,
                    "eval_string_match": metrics.string_same,
                },
                primary_metric=("eval_execution_f1", "maximize"),
            )

            # region disk management

            for step in checkpoints_to_delete(experiment):
                assert (
                    step != global_step
                ), f"Can't delete step {step}; need it for next training epoch starting at step {global_step}"
                print(f"Deleting checkpoint {step}")
                delete_checkpoint(FLAGS.model_dir, step)
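
Example #2 calls checkpoint_path, delete_checkpoint, and checkpoints_to_delete, which are defined elsewhere in the module. The sketch below shows plausible implementations of the first two, assuming each saved step gets its own checkpoint prefix under the model directory; the naming scheme is an assumption, not taken from the original code.

import glob
import os

def checkpoint_path(model_dir: str, global_step: int) -> str:
    # Assumed naming scheme: one checkpoint prefix per saved global step.
    return os.path.join(model_dir, "ckpt-{}".format(global_step))

def delete_checkpoint(model_dir: str, global_step: int) -> None:
    # A tf.train.Saver checkpoint is a family of files sharing one prefix
    # (.index, .meta, .data-*); remove all of them for the given step.
    prefix = checkpoint_path(model_dir, global_step)
    for filename in glob.glob(prefix + ".*"):
        os.remove(filename)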
Example #3
def main(unused_argv):
    tf.logging.info("Saving model saves and results to " + FLAGS.model_dir)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    config = model_config.load_config(FLAGS.config)

    if FLAGS.do_train:
        tf.logging.info("Training with train filenames: " +
                        str(FLAGS.training_filename))

    training_options = config.training_options
    use_tpu = FLAGS.use_tpu
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_summary_steps=1,
        save_checkpoints_steps=FLAGS.steps_between_saves,
        keep_checkpoint_max=KEEP_CHECKPOINTS_MAX,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=training_options.tpu_iterations_per_loop,
            num_shards=FLAGS.num_tpu_shards))

    # Set up estimator
    model_fn = model_builder.build_model_fn(config, FLAGS.output_vocab,
                                            use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        use_tpu=use_tpu,
        config=run_config,
        train_batch_size=config.training_options.batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        train_input_fn = input_pipeline.create_training_input_fn(
            config, FLAGS.tf_examples_dir,
            [name for name in FLAGS.training_filename if name], use_tpu)

        estimator.train(input_fn=train_input_fn,
                        max_steps=config.training_options.training_steps)

    if FLAGS.do_eval:
        max_acc = 0.

        eval_input_fn = input_pipeline.create_eval_input_fn(
            config, FLAGS.tf_examples_dir, [FLAGS.eval_filename], use_tpu)

        # When FLAGS.init_checkpoint = None, the latest checkpoint will be evaluated
        num_train_steps = int(config.training_options.training_steps)

        for ckpt in tf.contrib.training.checkpoints_iterator(FLAGS.model_dir):
            acc = evaluate(estimator, eval_input_fn, ckpt)
            if acc > max_acc:
                max_acc = acc
                copy_checkpoint(
                    ckpt,
                    os.path.join(
                        FLAGS.model_dir,
                        str(get_ckpt_number(ckpt)) + "model_max_" +
                        FLAGS.eval_filename.split(".")[0] + ".ckpt"))
            if get_ckpt_number(ckpt) == num_train_steps:
                break
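
Example #3 relies on helpers get_ckpt_number and copy_checkpoint that are not shown. A sketch of what they might look like, assuming the default model.ckpt-<step> naming produced by the estimator; both the regex and the file handling are assumptions rather than the original implementations.

import re

def get_ckpt_number(ckpt):
    # Assumes checkpoint prefixes like ".../model.ckpt-12000".
    match = re.search(r"ckpt-(\d+)", ckpt)
    return int(match.group(1)) if match else -1

def copy_checkpoint(src_prefix, dst_prefix):
    # Copy every file that belongs to the checkpoint (.index, .meta, .data-*).
    for path in tf.gfile.Glob(src_prefix + ".*"):
        tf.gfile.Copy(path, dst_prefix + path[len(src_prefix):], overwrite=True)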