Example 1
    def continue_training(cls,
                          training_data,
                          epochs,
                          model_directory,
                          batch_size=2048,
                          validation_data=None):
        """
        Continue training a model that was already created by a previous training operation.

        :param training_data: text pairs and labels
        :type training_data: pandas.DataFrame
        :param epochs: number of training epochs
        :type epochs: int
        :param model_directory: directory in which to write model checkpoints
        :type model_directory: str or None
        :param batch_size: number of samples per batch
        :type batch_size: int
        :param validation_data: optional validation data
        :type validation_data: pandas.DataFrame or None
        :return: the trained model and its training history
        :rtype: (TextPairClassifier, TrainingHistory)
        """
        model = cls._load(cls._model_filename(model_directory))
        training = TextPairEmbeddingGenerator(
            training_data,
            maximum_tokens=model.maximum_tokens,
            batch_size=batch_size)
        return cls._train(epochs, model, model_directory, training,
                          validation_data)
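
A minimal usage sketch for the call above. The class name TextPairClassifier comes from the docstring's return type; the data frame contents, column names, and checkpoint directory are illustrative assumptions:

import pandas as pd

# Hypothetical text-pair data; the column names the project actually
# expects are not shown in this snippet.
more_training_data = pd.DataFrame({
    "text1": ["a cat sat on the mat", "the dog barked"],
    "text2": ["a feline sat on the rug", "the cat meowed"],
    "label": [1, 0],
})

# Resume training from the checkpoint written by a previous run.
model, history = TextPairClassifier.continue_training(
    more_training_data, epochs=5, model_directory="checkpoints")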
Example 2
    def fit(self,
            training,
            epochs=1,
            validation_data=None,
            model_directory=None):
        """
        Fit the model to the training data.

        :param training: training data generator
        :type training: TextPairEmbeddingGenerator
        :param epochs: number of epochs to train
        :type epochs: int
        :param validation_data: data for validation at the end of each epoch
        :type validation_data: pandas.DataFrame or None
        :param model_directory: directory in which to serialize the model
        :type model_directory: str or None
        :return: training history
        :rtype: keras.callbacks.History
        """
        logger.info("Train model: %d samples, %d epochs, batch size %d" %
                    (len(training), epochs, training.batch_size))
        if validation_data is not None:
            g = TextPairEmbeddingGenerator(validation_data,
                                           maximum_tokens=self.maximum_tokens,
                                           batch_size=training.batch_size)
            validation_embeddings, validation_steps = g(), g.batches_per_epoch
        else:
            validation_embeddings = validation_steps = None
        verbose = {
            logging.INFO: 2,
            logging.DEBUG: 1
        }.get(logger.getEffectiveLevel(), 0)
        if model_directory is not None:
            if validation_data is not None:
                monitor = "val_loss"
            else:
                monitor = "loss"
            callbacks = [
                ModelCheckpoint(filepath=self._model_filename(model_directory),
                                monitor=monitor,
                                save_best_only=True,
                                verbose=verbose)
            ]
        else:
            callbacks = None
        logger.info("Start training")
        return self.model.fit_generator(
            generator=training(),
            steps_per_epoch=training.batches_per_epoch,
            epochs=epochs,
            validation_data=validation_embeddings,
            validation_steps=validation_steps,
            callbacks=callbacks,
            verbose=verbose)
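
A hedged sketch of driving fit directly; clf, train_df, and validation_df are hypothetical stand-ins:

# Assumes `clf` is an already-constructed classifier instance and the
# data frames hold text pairs and labels.
training = TextPairEmbeddingGenerator(train_df, batch_size=256)
history = clf.fit(training,
                  epochs=10,
                  validation_data=validation_df,
                  model_directory="checkpoints")
print(history.history["loss"])  # per-epoch training loss

Note that Keras verbosity is derived from the logger level: INFO prints one summary line per epoch (2), DEBUG shows the progress bar (1), and any other level is silent (0).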
Example 3
    def test_embed_labeled(self):
        g = TextPairEmbeddingGenerator(self.labeled, batch_size=32)
        assert_array_equal([0, 1], g.classes)
        self.assertEqual(100, len(g))
        self.assertEqual(
            "TextPairEmbeddingGenerator: 100 samples, classes [0, 1], batch size 32, maximum tokens 40",
            str(g))
        self.assertEqual(4, g.batches_per_epoch)
        two_epochs = list(islice(g(), 2 * g.batches_per_epoch))
        self._validate_labeled_batches(two_epochs, g.batches_per_epoch, 40,
                                       [32, 32, 32, 4] * 2)
Example 4
    def test_embed_unlabeled(self):
        g = TextPairEmbeddingGenerator(self.unlabeled, batch_size=4)
        self.assertEqual(9, len(g))
        self.assertEqual(
            "TextPairEmbeddingGenerator: 9 samples, batch size 4, maximum tokens 20",
            str(g))
        self.assertIsNone(g.classes)
        self.assertEqual(3, g.batches_per_epoch)
        two_epochs = list(islice(g(), 2 * g.batches_per_epoch))
        self._validate_unlabeled_batches(two_epochs, g.batches_per_epoch, 20,
                                         [4, 4, 1] * 2)
Example 5
    def test_embed_labeled_specified_maximum_tokens(self):
        g = TextPairEmbeddingGenerator(self.labeled,
                                       batch_size=32,
                                       maximum_tokens=10)
        self.assertEqual(100, len(g))
        self.assertEqual(
            "TextPairEmbeddingGenerator: 100 samples, classes [0, 1], batch size 32, maximum tokens 10",
            str(g))
        self.assertEqual(4, g.batches_per_epoch)
        two_epochs = list(islice(g(), 2 * g.batches_per_epoch))
        self._validate_labeled_batches(two_epochs, g.batches_per_epoch, 10,
                                       [32, 32, 32, 4] * 2)
Example 6
    def train(cls,
              training_data,
              bidirectional,
              lstm_units,
              epochs,
              dropout=None,
              maximum_tokens=None,
              batch_size=2048,
              validation_data=None,
              model_directory=None):
        """
        Train a model from aligned text pairs in data frames.

        :param training_data: text pairs and labels
        :type training_data: pandas.DataFrame
        :param bidirectional: should the shared LSTM be bidirectional?
        :type bidirectional: bool
        :param lstm_units: number of hidden units in the LSTM
        :type lstm_units: int
        :param epochs: number of training epochs
        :type epochs: int
        :param dropout: dropout rate or None for no dropout
        :type dropout: float or None
        :param maximum_tokens: maximum number of tokens to embed per sample
        :type maximum_tokens: int or None
        :param batch_size: number of samples per batch
        :type batch_size: int
        :param validation_data: optional validation data
        :type validation_data: pandas.DataFrame or None
        :param model_directory: directory in which to write model checkpoints
        :type model_directory: str or None
        :return: the trained model and its training history
        :rtype: (TextPairClassifier, TrainingHistory)
        """
        training = TextPairEmbeddingGenerator(training_data,
                                              batch_size=batch_size,
                                              maximum_tokens=maximum_tokens)
        model = cls.create(len(training.classes), training.maximum_tokens,
                           embedding_size(), lstm_units, dropout,
                           bidirectional)
        if model_directory is not None:
            os.makedirs(model_directory)
            with open(cls._info_filename(model_directory), "w") as f:
                f.write("%s\n%s\n" % (text_parser_info(), model))
        return cls._train(epochs, model, model_directory, training,
                          validation_data)
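
A sketch of a complete training call; the data frames and hyperparameter values are illustrative assumptions:

model, history = TextPairClassifier.train(
    train_df,               # text pairs and labels
    bidirectional=True,     # use a bidirectional shared LSTM
    lstm_units=128,
    epochs=10,
    dropout=0.5,            # or None to disable dropout
    validation_data=validation_df,
    model_directory="checkpoints")  # must not already exist: os.makedirs raises otherwise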
Example 7
    def predict(self, test_data, batch_size=2048, class_names=None):
        """
        Predict probability distribution over labels for a test set.

        :param test_data: unlabeled text pair data
        :type test_data: pandas.DataFrame
        :param batch_size: number of test samples per batch
        :type batch_size: int
        :param class_names: optional column names to use for the classes
        :type class_names: list or None
        :return: data frame of test samples and label probabilities
        :rtype: pandas.DataFrame
        """
        g = TextPairEmbeddingGenerator(test_data,
                                       maximum_tokens=self.maximum_tokens,
                                       batch_size=batch_size)
        probabilities = self.model.predict_generator(generator=g(),
                                                     steps=g.batches_per_epoch)
        return pd.DataFrame(
            probabilities.reshape((len(test_data), self.classes)),
            columns=class_names)
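
A usage sketch; test_df and the class names are assumptions (class_names, if given, must have one entry per model class):

predictions = model.predict(test_df, class_names=["different", "same"])
# One row per test sample; the columns hold the predicted probability
# of each class.
print(predictions.head())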
Example 8
    def score(self, labeled_test_data, batch_size=2048):
        """
        Score the model's performance on a labeled test set.

        :param labeled_test_data: labeled test data
        :type labeled_test_data: pandas.DataFrame
        :param batch_size: number of test samples per batch
        :type batch_size: int
        :return: list of metric names and their corresponding values for the test set
        :rtype: list of (str, float)
        """
        # "label" here refers to a module-level constant (not shown in this
        # snippet) that names the label column of the data frame.
        assert label in labeled_test_data
        g = TextPairEmbeddingGenerator(labeled_test_data,
                                       maximum_tokens=self.maximum_tokens,
                                       batch_size=batch_size)
        if self.classes != len(g.classes):
            raise ValueError(
                "Test data categories %s do not align with the %d labels in the model"
                % (g.classes, self.classes))
        metrics = self.model.evaluate_generator(generator=g(),
                                                steps=g.batches_per_epoch)
        return list(zip(self.model.metrics_names, metrics))
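
A sketch of reporting the scores; labeled_test_df is a hypothetical labeled data frame:

for name, value in model.score(labeled_test_df):
    print("%s: %0.4f" % (name, value))
# Prints the Keras loss plus whatever metrics the model was compiled with.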