def test_generate_train_batches(self):
        get_train_gen, train_size = self.data_manager.get_train_data_from_file(
            [self.TRAIN_FILE])
        batch_gen = DataManager.get_batch_generator(get_train_gen, 2)
        new_batch_gen = DataManager.get_batch_generator(get_train_gen, 2)

        # Assert that the new generator is a different object
        # than the old generator.
        assert new_batch_gen is not batch_gen
        assert train_size == 3

        first_batch = batch_gen.__next__()
        new_first_batch = new_batch_gen.__next__()
        inputs, labels = first_batch
        new_inputs, new_labels = new_first_batch
        assert len(inputs) == len(new_inputs) == 2
        assert len(labels) == len(new_labels) == 1

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[2, 0], [5, 0]]))
        assert_allclose(inputs[1], np.array([[3, 4], [6, 0]]))
        assert_allclose(labels[0], np.array([[1, 0], [0, 1]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])
        assert_allclose(labels[0], new_labels[0])

        second_batch = batch_gen.__next__()
        new_second_batch = new_batch_gen.__next__()
        inputs, labels = second_batch
        new_inputs, new_labels = new_second_batch
        assert len(inputs) == len(new_inputs) == 2
        assert len(labels) == len(new_labels) == 1

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[7, 0]]))
        assert_allclose(inputs[1], np.array([[8, 0]]))
        assert_allclose(labels[0], np.array([[1, 0]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])
        assert_allclose(labels[0], new_labels[0])

        # Both generators should now be exhausted. Each call needs its own
        # context manager; otherwise the second call is never reached.
        with self.assertRaises(StopIteration):
            batch_gen.__next__()
        with self.assertRaises(StopIteration):
            new_batch_gen.__next__()
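
A minimal sketch of the batching behavior this test exercises, assuming DataManager.get_batch_generator takes a zero-argument generator factory (the factory convention is inferred from the tests; the real implementation also transposes each batch into stacked input and label arrays, which this sketch omits):

from itertools import islice

def get_batch_generator(get_feature_generator, batch_size):
    """Yield batches from a fresh feature generator.

    ``get_feature_generator`` is a zero-argument callable returning a new
    generator, so each call to this function yields an independent stream.
    """
    feature_gen = get_feature_generator()
    while True:
        batch = list(islice(feature_gen, batch_size))
        if not batch:
            return  # surfaces as StopIteration in the caller
        yield batch  # the last batch may be smaller than batch_size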
Example #2
    def _evaluate_on_validation(self, get_val_feature_generator, batch_size,
                                num_val_steps, session):
        val_batch_gen = DataManager.get_batch_generator(
            get_val_feature_generator, batch_size)
        # Calculate the mean of the validation metrics
        # over the validation set.
        val_accuracies = []
        val_losses = []
        for val_batch in tqdm(val_batch_gen,
                              total=num_val_steps,
                              desc="Validation Batches Completed",
                              leave=False):

            # Ignore the last batch if its size doesn't match:
            # if len(val_batch) != batch_size:
            #     continue
            feed_dict = self._get_validation_feed_dict(val_batch)
            val_batch_acc, val_batch_loss = session.run(
                [self.eval_metric, self.loss], feed_dict=feed_dict)

            val_accuracies.append(val_batch_acc)
            val_losses.append(val_batch_loss)

        # Take the mean of the accuracies and losses.
        # TODO/FIXME: this assumes every batch has the same size, which
        # is not necessarily true (the last batch may be smaller). A
        # size-weighted alternative is sketched after this example.
        mean_val_accuracy = np.mean(val_accuracies)
        mean_val_loss = np.mean(val_losses)

        # Create a new Summary object with mean_val accuracy
        # and mean_val_loss and add it to Tensorboard.
        val_summary = tf.Summary(value=[
            tf.Summary.Value(tag="_val_summaries/loss",
                             simple_value=mean_val_loss),
            tf.Summary.Value(tag="_val_summaries/accuracy",
                             simple_value=mean_val_accuracy)
        ])
        return mean_val_accuracy, mean_val_loss, val_summary
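
The TODO above notes that a plain mean over per-batch metrics is skewed when the final batch is smaller. A minimal size-weighted sketch, assuming each batch exposes its example count via len() (an assumption; the batch structure is not shown in this code):

import numpy as np

def weighted_metric_mean(batch_metrics, batch_sizes):
    """Mean of per-batch metrics weighted by batch size, equivalent to a
    per-example mean, so a smaller final batch no longer biases the result."""
    return float(np.average(batch_metrics, weights=batch_sizes))

# Hypothetical usage inside the validation loop above:
#     batch_sizes.append(len(val_batch))
# and after the loop:
#     mean_val_accuracy = weighted_metric_mean(val_accuracies, batch_sizes)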
Example #3
    def test(self,
             get_test_instance_generator,
             model_load_dir,
             batch_size,
             num_test_steps=None):
        """
        Load a serialized model and use it for prediction on a test
        set (from a finite generator).

        :param get_test_instance_generator: Function returning generator
            This function should return a finite generator that produces features
            for use in training.

        :param model_load_dir: str
            Path to a directory with serialized tensorflow checkpoints for the
            model to be run. The most recent checkpoint will be loaded and used
            for prediction.

        :param batch_size: int
            The number of features per batch produced by the generator.

        :param num_test_steps: int
            The number of steps (calculated by ceil(total # test examples / batch_size))
            in testing. This does not have any effect on how much of the test data
            is read; inference keeps going until the generator is exhausted. It
            is used to set a total for the progress bar.
        """
        previous_mode = self._mode
        self._mode = 'predict'

        if num_test_steps is None:
            logger.info("num_test_steps is not set, pass in a value "
                        "to show a progress bar.")

        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options,
                                     allow_soft_placement=True,
                                     log_device_placement=True)
        with tf.Session(config=sess_config) as sess:
            saver = tf.train.Saver()
            logger.info(
                "Getting latest checkpoint in {}".format(model_load_dir))
            last_checkpoint = tf.train.latest_checkpoint(model_load_dir)
            logger.info(
                "Attempting to load checkpoint at {}".format(last_checkpoint))
            saver.restore(sess, last_checkpoint)
            logger.info("Successfully loaded {}!".format(last_checkpoint))

            # Get a generator of test batches
            test_batch_gen = DataManager.get_batch_generator(
                get_test_instance_generator, batch_size)

            y_pred = []
            for batch in tqdm(test_batch_gen,
                              total=num_test_steps,
                              desc="Test Batches Completed"):
                feed_dict = self._get_test_feed_dict(batch)
                y_pred_batch = sess.run(self.prediction, feed_dict=feed_dict)
                y_pred.append(y_pred_batch)
            y_pred_flat = np.concatenate(y_pred, axis=0)

        self._mode = previous_mode
        return y_pred_flat
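
A hedged usage sketch for test(); the model object, file name, and checkpoint directory are hypothetical, and only the keyword arguments come from the signature above:

import math

batch_size = 32  # hypothetical value
# `data_manager` and `model` are assumed to be constructed elsewhere.
get_test_gen, test_size = data_manager.get_test_data_from_file(["test.csv"])
num_test_steps = int(math.ceil(test_size / batch_size))
predictions = model.test(
    get_test_instance_generator=get_test_gen,
    model_load_dir="models/checkpoints",  # hypothetical path
    batch_size=batch_size,
    num_test_steps=num_test_steps)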
Example #4
    def train(self,
              get_train_feature_generator,
              get_val_feature_generator,
              batch_size,
              num_epochs,
              num_train_steps_per_epoch,
              num_val_steps,
              val_period,
              log_period,
              save_period,
              max_ckpts_to_keep=10,
              patience=0):
        """
        Train the model.

        :param get_train_feature_generator: Function returning generator
            This function should return a finite generator that produces
            features for use in training.

        :param get_val_feature_generator: Function returning generator
            This function should return a finite generator that produces
            features for use in validation.

        :param batch_size: int
            The number of features per batch produced by the generator.

        :param num_epochs: int
            The number of epochs to train for.

        :param num_train_steps_per_epoch: int
            The number of training steps after which an epoch has passed.

        :param num_val_steps: int
            The number of batches generated by the validation batch generator.

        :param val_period: int, optional (default=250)
            Number of steps between each evaluation of performance on the
            held-out validation set.

        :param log_period: int, optional (default=10)
            Number of steps between each summary op evaluation.

        :param save_period: int, optional (default=250)
            Number of steps between each model checkpoint.

        :param max_ckpts_to_keep: int, optional (default=10)
            The maximum number of model checkpoints to keep.

        :param patience: int, optional (default=0)
            The number of epochs with no improvement in validation loss
            after which training will be stopped.
        """
        previous_mode = self._mode
        self._mode = 'train'
        global_step = 0
        init_op = tf.global_variables_initializer()

        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options,
                                     allow_soft_placement=True,
                                     log_device_placement=True)
        with tf.Session(config=sess_config) as sess:
            sess.run(init_op)

            self._setup_summaries(sess=sess)

            epoch_validation_losses = []
            # Iterate over a generator that returns batches.
            for epoch in tqdm(range(num_epochs), desc="Epochs Completed"):
                # Get a generator of train batches
                train_batch_gen = DataManager.get_batch_generator(
                    get_train_feature_generator, batch_size)
                # Iterate over the generated batches
                for train_batch in tqdm(train_batch_gen,
                                        total=num_train_steps_per_epoch,
                                        desc="Train Batches Completed",
                                        leave=False):

                    global_step = sess.run(self.global_step) + 1

                    feed_dict = self._get_train_feed_dict(train_batch)

                    # Do a gradient update, and log results to Tensorboard
                    # if necessary.
                    if global_step % log_period == 0:
                        # Record summary with gradient update
                        train_loss, _, train_summary = sess.run(
                            [self.loss, self.optimizer, self.train_summary_op],
                            feed_dict=feed_dict)
                        self._train_summary_writer.add_summary(
                            train_summary, global_step)
                    else:
                        # Do a gradient update without recording anything.
                        train_loss, _ = sess.run([self.loss, self.optimizer],
                                                 feed_dict=feed_dict)

                    if global_step % val_period == 0:
                        # Evaluate on validation data
                        val_acc, val_loss, val_summary = self._evaluate_on_validation(
                            get_val_feature_generator=get_val_feature_generator,
                            batch_size=batch_size,
                            num_val_steps=num_val_steps,
                            session=sess)
                        self._val_summary_writer.add_summary(
                            val_summary, global_step)
                    # Write a model checkpoint if necessary.
                    if global_step % save_period == 0:
                        ret = self._saver.save(sess,
                                               self._save_dir + '/' +
                                               self._name,
                                               global_step=global_step)
                        logger.info('Saved model checkpoint @ ' +
                                    os.path.abspath(ret))

                # End of the epoch, so save the model and check validation loss,
                # stopping if applicable.
                model_path = self._saver.save(sess,
                                              self._save_dir + '/' +
                                              self._name,
                                              global_step=global_step)
                logger.info('Saved end-of-epoch model @ ' +
                            os.path.abspath(model_path))

                val_acc, val_loss, val_summary = self._evaluate_on_validation(
                    get_val_feature_generator=get_val_feature_generator,
                    batch_size=batch_size,
                    num_val_steps=num_val_steps,
                    session=sess)
                self._val_summary_writer.add_summary(val_summary, global_step)

                epoch_validation_losses.append(val_loss)

                # Get the lowest validation loss, with regards to the patience
                # threshold.
                patience_val_losses = epoch_validation_losses[:-(patience + 1)]
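                # Worked example (hypothetical numbers): with patience=1
                # and losses [0.9, 0.7, 0.8, 0.85], the slice [:-2] keeps
                # [0.9, 0.7]; min(0.7) <= current 0.85, so training stops.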
                if patience_val_losses:
                    min_patience_val_loss = min(patience_val_losses)
                else:
                    min_patience_val_loss = math.inf
                if min_patience_val_loss <= val_loss:
                    # A loss from outside the patience window was at
                    # least as low as the current loss, so stop early.
                    logger.info("Best validation loss of {} from more than "
                                "{} epoch(s) ago is not higher than the "
                                "current epoch validation loss of {}; "
                                "stopping early.".format(min_patience_val_loss,
                                                         patience, val_loss))
                    break

            # Run any model-specific evaluations.
            self._evaluate_model_parameters(sess)

        # Done training!
        logger.info("Finished {} epochs!".format(epoch + 1))
        self._mode = previous_mode
        return os.path.abspath(model_path)
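
A corresponding hedged usage sketch for train(); all names and values are hypothetical, and the validation-data accessor is an assumption (only get_train_data_from_file appears above):

import math

batch_size = 32  # hypothetical values throughout
get_train_gen, train_size = data_manager.get_train_data_from_file(
    ["train.csv"])
get_val_gen, val_size = data_manager.get_validation_data_from_file(
    ["val.csv"])  # hypothetical accessor
model_path = model.train(
    get_train_feature_generator=get_train_gen,
    get_val_feature_generator=get_val_gen,
    batch_size=batch_size,
    num_epochs=10,
    num_train_steps_per_epoch=int(math.ceil(train_size / batch_size)),
    num_val_steps=int(math.ceil(val_size / batch_size)),
    val_period=250,
    log_period=10,
    save_period=250,
    patience=2)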
class TestDataManagerTest(DuplicateTestCase):
    @overrides
    def setUp(self):
        super(TestDataManagerTest, self).setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_test_file()
        self.data_manager = DataManager(PairFeature)
        self.data_manager.get_train_data_from_file([self.TRAIN_FILE])

    def test_get_test_data_default(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        assert test_size == 3
        test_gen = get_test_gen()
        inputs1, labels1 = test_gen.__next__()
        assert_allclose(inputs1[0], np.array([2, 1]))
        assert_allclose(inputs1[1], np.array([1, 0]))

        inputs2, labels2 = test_gen.__next__()
        assert_allclose(inputs2[0], np.array([4, 0]))
        assert_allclose(inputs2[1], np.array([5, 1]))

        inputs3, labels3 = test_gen.__next__()
        assert_allclose(inputs3[0], np.array([6, 0]))
        assert_allclose(inputs3[1], np.array([7, 0]))

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            test_gen.__next__()

        # Test that we can make a new test generator
        new_test_gen = get_test_gen()
        # Verify that the new and old generator are not the same object
        assert new_test_gen is not test_gen
        new_inputs1, new_labels1 = new_test_gen.__next__()
        assert_allclose(new_inputs1, inputs1)
        assert_allclose(new_labels1, labels1)
        new_inputs2, new_labels2 = new_test_gen.__next__()
        assert_allclose(new_inputs2, inputs2)
        assert_allclose(new_labels2, labels2)
        new_inputs3, new_labels3 = new_test_gen.__next__()
        assert_allclose(new_inputs3, inputs3)
        assert_allclose(new_labels3, labels3)
        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            new_test_gen.__next__()
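
    # The re-creatable generator behavior asserted above relies on
    # get_test_data_from_file returning a zero-argument factory rather
    # than a generator. A minimal sketch of that pattern (illustrative
    # only, not the actual DataManager implementation):
    #
    #     def make_generator_factory(features):
    #         def feature_generator():
    #             for feature in features:
    #                 yield feature
    #         return feature_generator
    #
    #     get_test_gen = make_generator_factory(indexed_features)
    #     gen_a, gen_b = get_test_gen(), get_test_gen()
    #     assert gen_a is not gen_b  # independent generators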

    def test_get_test_data_default_character(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="character")
        test_gen = get_test_gen()
        assert test_size == 3
        inputs1, labels = test_gen.__next__()
        assert_allclose(
            inputs1[0],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 10]]))
        assert_allclose(
            inputs1[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 11],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        inputs2, labels = test_gen.__next__()
        assert_allclose(
            inputs2[0],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 12, 19, 17, 18],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(
            inputs2[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 13, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 12]]))
        assert len(labels) == 0

        inputs3, labels = test_gen.__next__()
        assert_allclose(
            inputs3[0],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 14, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(
            inputs3[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0
        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            test_gen.__next__()

    def test_get_test_data_default_word_and_character(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], mode="word+character")
        test_gen = get_test_gen()
        assert test_size == 3
        inputs1, labels = test_gen.__next__()
        assert_allclose(inputs1[0], np.array([2, 1]))
        assert_allclose(
            inputs1[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 10]]))
        assert_allclose(inputs1[2], np.array([1, 0]))
        assert_allclose(
            inputs1[3],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 11],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        inputs2, labels = test_gen.__next__()
        assert_allclose(inputs2[0], np.array([4, 0]))
        assert_allclose(
            inputs2[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 12, 19, 17, 18],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs2[2], np.array([5, 1]))
        assert_allclose(
            inputs2[3],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 13, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 9, 4, 1, 12]]))
        assert len(labels) == 0

        inputs3, labels = test_gen.__next__()
        assert_allclose(inputs3[0], np.array([6, 0]))
        assert_allclose(
            inputs3[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 14, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs3[2], np.array([7, 0]))
        assert_allclose(
            inputs3[3],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            test_gen.__next__()

    def test_get_test_data_pad_with_max_lens(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], max_lengths={"num_sentence_words": 1})
        test_gen = get_test_gen()
        assert test_size == 3

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([2]))
        assert_allclose(inputs[1], np.array([1]))
        assert len(labels) == 0

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([4]))
        assert_allclose(inputs[1], np.array([5]))
        assert len(labels) == 0

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([6]))
        assert_allclose(inputs[1], np.array([7]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            test_gen.__next__()
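
    # The max_lengths behavior asserted above pads or truncates each word
    # index sequence to num_sentence_words. A minimal sketch (names are
    # inferred from the test, not from the DataManager internals):
    #
    #     def pad_or_truncate(indices, num_sentence_words):
    #         padded = np.zeros(num_sentence_words, dtype=int)
    #         trimmed = indices[:num_sentence_words]
    #         padded[:len(trimmed)] = trimmed
    #         return padded
    #
    # With num_sentence_words=1 the unpadded first sentence [2, 1, 2]
    # (see test_get_test_data_no_pad below) becomes [2], matching the
    # assertion above.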

    def test_get_test_data_with_max_features(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], max_features=2)
        test_gen = get_test_gen()
        assert test_size == 2

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([2, 1]))
        assert_allclose(inputs[1], np.array([1, 0]))
        assert len(labels) == 0

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([4, 0]))
        assert_allclose(inputs[1], np.array([5, 1]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            test_gen.__next__()

    def test_get_test_data_errors(self):
        with self.assertRaises(ValueError):
            self.data_manager.get_test_data_from_file(
                [self.TEST_FILE],
                max_lengths={"num_sentence_words": 1},
                pad=False)
        with self.assertRaises(ValueError):
            self.data_manager.get_test_data_from_file(
                [self.TEST_FILE], max_lengths={"some wrong key": 1})

    def test_get_test_data_no_pad(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE], pad=False)
        test_gen = get_test_gen()
        assert test_size == 3

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([2, 1, 2]))
        assert_allclose(inputs[1], np.array([1]))
        assert len(labels) == 0

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([4]))
        assert_allclose(inputs[1], np.array([5, 1]))
        assert len(labels) == 0

        inputs, labels = test_gen.__next__()
        assert_allclose(inputs[0], np.array([6]))
        assert_allclose(inputs[1], np.array([7]))
        assert len(labels) == 0

        # Should raise a StopIteration
        with self.assertRaises(StopIteration):
            test_gen.__next__()

    def test_generate_test_batches(self):
        get_test_gen, test_size = self.data_manager.get_test_data_from_file(
            [self.TEST_FILE])
        batch_gen = DataManager.get_batch_generator(get_test_gen, 2)
        new_batch_gen = DataManager.get_batch_generator(get_test_gen, 2)

        # Assert that the new generator is a different object
        # than the old generator.
        assert new_batch_gen is not batch_gen
        assert test_size == 3

        first_batch = batch_gen.__next__()
        new_first_batch = new_batch_gen.__next__()
        inputs, labels = first_batch
        new_inputs, new_labels = new_first_batch
        assert len(inputs) == 2
        assert len(labels) == 0

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[2, 1], [4, 0]]))
        assert_allclose(inputs[1], np.array([[1, 0], [5, 1]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])

        second_batch = batch_gen.__next__()
        new_second_batch = new_batch_gen.__next__()
        inputs, labels = second_batch
        new_inputs, new_labels = new_second_batch
        assert len(inputs) == 2
        assert len(labels) == 0

        # Ensure output matches ground truth
        assert_allclose(inputs[0], np.array([[6, 0]]))
        assert_allclose(inputs[1], np.array([[7, 0]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])

        # Both generators should now be exhausted. Each call needs its own
        # context manager; otherwise the second call is never reached.
        with self.assertRaises(StopIteration):
            batch_gen.__next__()
        with self.assertRaises(StopIteration):
            new_batch_gen.__next__()