Example #1
    def test_RestoreIncorrectSummaries(self, request):
        """
        Tests that an exception is thrown if a summary name is provided which couldn't be found in
        the graph. This could happen when one of the summary names provided to the constructor
        didn't exist in the previous run.
        """
        tf.reset_default_graph()

        # Set up our helpers, using a single summary
        MODEL_DIR = request.node.name
        START_DATETIME = datetime.utcnow().strftime("%Y%m%d-%H%M")
        manager = FileManager(MODEL_DIR)
        logHelper = TensorboardLogHelper(manager.getModelDir(),
                                         tf.get_default_graph(), ["test1"],
                                         False)

        # Write once
        with tf.Session() as sess:
            logHelper.writeSummary(sess, [5.0])

        logHelper.close()

        # Now restore, but using an incorrect summary name
        with pytest.raises(KeyError):
            logHelper = TensorboardLogHelper(manager.getModelDir(),
                                             tf.get_default_graph(), ["test2"],
                                             True)
Example #2
    def test_FileCreation(self, request):
        """
        Tests if the correct files are generated when a model is saved.
        """

        # Let's add some basic operators to the graph
        tf.reset_default_graph()
        A = tf.Variable(10, dtype=tf.float32)
        B = tf.Variable(15, dtype=tf.float32)
        init = tf.global_variables_initializer()

        # Set up our helpers
        MODEL_DIR = request.node.name
        START_DATETIME = datetime.utcnow().strftime("%Y%m%d-%H%M")
        fileManager = FileManager(MODEL_DIR, None)
        restoreHelper = CheckpointAndRestoreHelper(
            fileManager.getModelDirAndPrefix(), False, tf.get_default_graph())

        # Now we'll save the model (we don't actually need to train anything)
        with tf.Session() as sess:
            init.run()
            restoreHelper.saveCheckpoint(sess, 0)

        # Check all files are created
        for filename in [
                "checkpoint", "model.ckpt.data-00000-of-00001",
                "model.ckpt.epoch", "model.ckpt.index", "model.ckpt.meta"
        ]:
            assert os.path.isfile(
                str(pathlib.Path.cwd() / "models" / MODEL_DIR /
                    START_DATETIME / filename))
Example #3
    def test_WriteStaticAndDynamicSummaries(self, request):
        """
        Tests that both static and dynamic summaries are written for several iterations.
        """
        tf.reset_default_graph()

        # Build a simple graph with one statically defined summary
        A = tf.Variable(10, dtype=tf.float32, name="A")
        tf.summary.scalar("static", A)
        init = tf.global_variables_initializer()

        # Set up our helpers, using a single dynamic summary
        MODEL_DIR = request.node.name
        START_DATETIME = datetime.utcnow().strftime("%Y%m%d-%H%M")
        manager = FileManager(MODEL_DIR)
        logHelper = TensorboardLogHelper(manager.getModelDir(),
                                         tf.get_default_graph(), ["dynamic"],
                                         False)

        with tf.Session() as sess:
            init.run()
            logHelper.writeSummary(sess, [5.0])

        logHelper.close()

        # Manually inspect the tensorboard log
        ea = event_accumulator.EventAccumulator(
            str(pathlib.Path.cwd() / "models" / MODEL_DIR / START_DATETIME))
        ea.Reload()
        assert ea.Scalars("static")[0].value == 10
        assert ea.Scalars("TensorboardLogHelper/dynamic_1")[0].value == 5
Example #4
    def test_RestoreMetaFail(self, request):
        """
        Tests if the model fails to restore correctly when there is no meta graph to restore from.
        """
        fileManager = FileManager(request.node.name, None)

        with pytest.raises(OSError):
            restoreHelper = CheckpointAndRestoreHelper(
                fileManager.getModelDirAndPrefix(), True,
                tf.get_default_graph())
Example #5
    def test_WithRestore(self):
        """
        Tests if the correct model paths are generated if a model to restore from is provided.
        """

        RESTORE_FROM = "20171225-1200"
        manager = FileManager("TestRegressor", RESTORE_FROM)

        assert manager.getModelDir(
        ) == os.getcwd() + "/models/TestRegressor/" + RESTORE_FROM
        assert manager.getModelDirAndPrefix(
        ) == os.getcwd() + "/models/TestRegressor/" + RESTORE_FROM + "/model"
Example #6
    def test_WithoutRestore(self):
        """
        Tests if the correct model paths are generated if no model to restore from is provided.
        """

        manager = FileManager("TestRegressor")
        TIMESTAMP = datetime.utcnow().strftime("%Y%m%d-%H%M")

        assert manager.getModelDir(
        ) == os.getcwd() + "/models/TestRegressor/" + TIMESTAMP
        assert manager.getModelDirAndPrefix(
        ) == os.getcwd() + "/models/TestRegressor/" + TIMESTAMP + "/model"
Example #7
    def test_IncorrectSummaries(self, request):
        """
        Tests that providing an incorrect number of summary values causes an exception.
        """
        tf.reset_default_graph()

        manager = FileManager(request.node.name)
        logHelper = TensorboardLogHelper(manager.getModelDir(),
                                         tf.get_default_graph(), ["summ"],
                                         False)

        with tf.Session() as sess:
            with pytest.raises(ValueError):
                logHelper.writeSummary(sess, [0, 1])

        logHelper.close()
Example #8
    def test_RestoreFreshRun(self, request):
        """
        Tests if the model can be restored during a training run where we reload the meta graph.
        This would be the most common use case, where the process has been interrupted and we have
        to load the graph from files.
        """

        # Let's add some basic operators to the graph
        tf.reset_default_graph()
        A = tf.Variable(10, dtype=tf.float32, name="A")
        B = tf.Variable(15, dtype=tf.float32, name="B")
        init = tf.global_variables_initializer()

        # And some ops so that we can mess with the variables
        A_mod = A.assign(5)
        B_mod = B.assign(5)

        # Set up our helpers
        fileManager = FileManager(request.node.name, None)
        restoreHelper = CheckpointAndRestoreHelper(
            fileManager.getModelDirAndPrefix(), False, tf.get_default_graph())

        # Now we'll save the model (we don't actually need to train anything)
        with tf.Session() as sess:
            init.run()
            restoreHelper.saveCheckpoint(sess, 0)

            # Now modify the variables
            A_mod.eval()
            B_mod.eval()
            assert A.eval() == 5
            assert B.eval() == 5

        # Now reset the graph and check that the variables are restored after loading the meta
        # graph. To do this we create CheckpointAndRestoreHelper with shouldRestore=True.
        tf.reset_default_graph()
        restoreHelper = CheckpointAndRestoreHelper(
            fileManager.getModelDirAndPrefix(), True, tf.get_default_graph())

        A = tf.get_default_graph().get_tensor_by_name("A:0")
        B = tf.get_default_graph().get_tensor_by_name("B:0")
        with tf.Session() as sess:
            restoreHelper.restoreFromCheckpoint(sess)
            assert A.eval() == 10
            assert B.eval() == 15
Example #9
    def test_WriteLogs(self, request):
        """
        Tests that log files are created in the correct location.
        """
        tf.reset_default_graph()

        MODEL_DIR = request.node.name
        START_DATETIME = datetime.utcnow().strftime("%Y%m%d-%H%M")
        manager = FileManager(MODEL_DIR)
        logHelper = TensorboardLogHelper(manager.getModelDir(),
                                         tf.get_default_graph(), ["summ"],
                                         False)

        with tf.Session() as sess:
            logHelper.writeSummary(sess, [0])

        logHelper.close()

        assert glob.glob(
            str(pathlib.Path.cwd() / "models" / MODEL_DIR / START_DATETIME /
                "events.out.tfevents.") + "*")
Example #10
    def test_SuccessfulRestore(self, request):
        """
        Tests that when restoring from a previous session new events are correctly appended.
        """
        tf.reset_default_graph()

        # Set up our helpers, using a single summary
        MODEL_DIR = request.node.name
        START_DATETIME = datetime.utcnow().strftime("%Y%m%d-%H%M")
        manager = FileManager(MODEL_DIR)
        logHelper = TensorboardLogHelper(manager.getModelDir(),
                                         tf.get_default_graph(), ["test1"],
                                         False)

        # Write once
        with tf.Session() as sess:
            logHelper.writeSummary(sess, [5.0])

        logHelper.close()

        # Now restore (wait one second, otherwise the first event file would be overwritten)
        time.sleep(1)
        logHelper = TensorboardLogHelper(manager.getModelDir(),
                                         tf.get_default_graph(), ["test1"],
                                         True)
        # Write again
        with tf.Session() as sess:
            logHelper.setIteration(1)
            logHelper.writeSummary(sess, [6.0])

        logHelper.close()

        # Manually inspect the tensorboard log
        ea = event_accumulator.EventAccumulator(
            str(pathlib.Path.cwd() / "models" / MODEL_DIR / START_DATETIME))
        ea.Reload()
        assert ea.Scalars("TensorboardLogHelper/test1_1")[0].value == 5
        assert ea.Scalars("TensorboardLogHelper/test1_1")[1].value == 6
Example #11
    def test_RestoreDuringRun(self, request):
        """
        Tests if the model can be restored during a training run where we continue using the same
        graph rather than loading the meta graph.
        """

        # Let's add some basic operators to the graph
        tf.reset_default_graph()
        A = tf.Variable(10, dtype=tf.float32)
        B = tf.Variable(15, dtype=tf.float32)
        init = tf.global_variables_initializer()

        # And some ops so that we can mess with the variables
        A_mod = A.assign(5)
        B_mod = B.assign(5)

        # Set up our helpers
        fileManager = FileManager(request.node.name, None)
        restoreHelper = CheckpointAndRestoreHelper(
            fileManager.getModelDirAndPrefix(), False, tf.get_default_graph())

        # Now we'll save the model (we don't actually need to train anything)
        with tf.Session() as sess:
            init.run()
            restoreHelper.saveCheckpoint(sess, 0)

            # Now modify the variables
            A_mod.eval()
            B_mod.eval()
            assert A.eval() == 5
            assert B.eval() == 5

            # Now restore the model and check that the variables are restored
            restoreHelper.restoreFromCheckpoint(sess)
            assert A.eval() == 10
            assert B.eval() == 15
Example #12
class TFRegressor(SKTFWrapper):
    """
    Provides functionality that is common to TF regression models, mainly the training loop.

    Derived classes must:
    - Provide a constructor which calls the constructor of this class
    - Implement a _buildGraph method which builds the graph and returns a RegressorTensors object
      holding its important tensors
    - Implement a _buildHyperParamsDict method which returns a dict of strings describing the
      model's hyperparameters (used to build the model name)
    - Optionally implement a _restoreGraph method if restoring from a checkpoint should be supported

    A minimal illustrative subclass is sketched after this example.
    """

    def __init__(self,
                 learningRate,
                 batchSize,
                 initializer,
                 dropoutRate,
                 restoreFrom,
                 outputLength):

        # Scikit-learn's API demands that constructor parameters are assigned to members with
        # exactly the same names, otherwise its clone method sets everything to None
        # (see BaseEstimator.get_params)
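        #
        # Roughly, sklearn.base.clone(estimator) rebuilds an estimator as
        #     estimator.__class__(**estimator.get_params(deep=False))
        # so every constructor argument must round-trip through an attribute of the same name.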
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.initializer = initializer
        self.dropoutRate = dropoutRate
        self.restoreFrom = restoreFrom
        self.outputLength = outputLength

        self._session = None
        self._graph = tf.Graph()

        self._fileManager = None
        self._allowRestore = restoreFrom is not None

        self._tensors = None
        self._init = None
        self._saver = None

    def _buildGraph(self, numFeatures):
        """
        Build the graph and return a RegressorTensors object which contains the important tensors
        for the graph.

        ** Derived classes should implement this **
        """
        raise NotImplementedError()

    def _restoreGraph(self, graph):
        """
        Use graph.get_tensor_by_name("<name>:0") to collect the important tensors and return a
        RegressorTensors object.

        ** Derived classes should implement this if they intend to support restoration **
        """
        raise NotImplementedError()

    def _buildHyperParamsDict(self) -> Dict[str, str]:
        """
        Return a dict of strings, where the keys are short (1 to 4 character) abbreviations of
        hyperparameter names and the values are the corresponding hyperparameter values.
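        For example (hypothetical values): {"lr": "0.001", "bs": "256", "drop": "0.5"}.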

        ** Derived classes should implement this **
        """
        raise NotImplementedError()

    def _buildModelNameStr(self) -> str:
        """
        Return a URL- and filename-safe string describing the model type and its hyperparameters.
        """
        modelName = self.__class__.__name__
        paramsDict = self._buildHyperParamsDict()
        for key, value in paramsDict.items():
            modelName += "-" + key + "-" + value

        return modelName

    def _onEpochComplete(self, numEpoch) -> None:
        """
        Override this method if any processing is needed at the end of each epoch. numEpoch is
        the zero-based index of the epoch that just completed.
        """
        pass

    def fit(self, X, y, X_valid, y_valid, numEpochs=1):
        """Fits the model on the training set"""
        self._closeSession()

        # This must be initialised during fit so that sklearn's grid search creates it at the
        # correct time
        self._fileManager = FileManager(self._buildModelNameStr(), self.restoreFrom)

        with self._graph.as_default():
            if self.restoreFrom is None:
                self._tensors = self._buildGraph(X.shape[1])
                self._init = tf.global_variables_initializer()

        stoppingHelper = EarlyStoppingHelper()
        restoreHelper = CheckpointAndRestoreHelper(self._fileManager.getModelDirAndPrefix(),
                                                   self.restoreFrom is not None,
                                                   self._graph)

        tensorboardHelper = TensorboardLogHelper(self._fileManager.getModelDir(),
                                                 self._graph,
                                                 ["LossTrain", "LossVal", "BatchTimeAvg"],
                                                 self.restoreFrom is not None)

        self._session = tf.Session(graph=self._graph)

        trainingValidator = TrainingValidator(self._graph, self._session)
        with self._session.as_default() as sess:
            if self.restoreFrom is None:
                startEpoch = 0
                self._init.run()
            else:
                startEpoch = restoreHelper.restoreFromCheckpoint(sess)
                tensorboardHelper.setIteration(startEpoch)

                # This call to restore the graph doesn't need to be done inside the session;
                # it may be better to move it outside the session
                try:
                    self._tensors = self._restoreGraph(self._graph)
                except KeyError as err:
                    print([n.name for n in self._graph.as_graph_def().node])
                    print("\n" + str(err))
                    print("\nThe available tensors/ops have been printed above this error")
                    raise RuntimeError("Model failed to restore")

            progressCalc = ProgressCalculator(numEpochs - startEpoch)
            progressCalc.start()
            for epoch in range(startEpoch, numEpochs):
                randomIndicies = np.random.permutation(len(X))
                NUM_BATCHES = len(X) // self.batchSize

                batchTimes = []
                for batchNumber, batchIndicies in enumerate(np.array_split(randomIndicies, NUM_BATCHES)):
                    batchStart = time.time()

                    X_batch, y_batch = X[batchIndicies], y[batchIndicies]

                    feed_dict = {self._tensors.X_in: X_batch,
                                 self._tensors.y_in: y_batch,
                                 self._tensors.dropoutKeepProb: 1 - self.dropoutRate}

                    sess.run(self._tensors.trainingOp, feed_dict=feed_dict)

                    batchTimes.append(time.time() - batchStart)
                    print("Batch:", batchNumber, "/", NUM_BATCHES, "{0:.4f}".format(batchTimes[-1]) + "s", end="\r")

                # Calculate and log the losses for this epoch
                lossTrain = self._tensors.loss.eval(feed_dict=feed_dict)
                lossVal = self._evalLossBatched(X_valid, y_valid)
                tensorboardHelper.writeSummary(sess, [lossTrain, lossVal, np.average(batchTimes)])
                progressCalc.updateInterval(1)
                print("\033[K" + "Epoch: {0}\tValidation loss: {1}\tTime Remaining: {2}".format(
                    epoch, lossVal, progressCalc.getTimeStampRemaining()))

                restoreHelper.saveCheckpoint(sess, epoch)

                if epoch < 2:
                    trainingValidator.validate(lossVal)

                self._onEpochComplete(epoch)

                # This must be the last thing done in an epoch
                if stoppingHelper.shouldStop(lossVal):
                    print("Early stopping at epoch: ", epoch)
                    break

            stoppingHelper.restoreBestModelParams()
            tensorboardHelper.close()

        print("Time taken:", progressCalc.timeTaken())

    def predict(self, X):
        """Returns the model's predictions for the provided data"""
        if not self._session:
            raise NotFittedError("This {} instance is not fitted yet".format(self.__class__.__name__))

        BATCH_SIZE = self.batchSize
        if len(X) < self.batchSize:
            BATCH_SIZE = len(X)

        NUM_BATCHES = len(X) // BATCH_SIZE
        indicies = np.arange(len(X))
        predictions = np.zeros((X.shape[0], self.outputLength))

        with self._session.as_default():
            for batchIndicies in np.array_split(indicies, NUM_BATCHES):
                X_batch = X[batchIndicies, :]
                predictions[batchIndicies, :] = self._tensors.logits.eval(feed_dict={self._tensors.X_in: X_batch})

        return predictions

    def _evalLossBatched(self, X, y):
        """Do validation in batches in case the dataset would need 10's of GB"""
        NUM_BATCHES = len(X) // self.batchSize
        indicies = np.arange(len(X))
        losses = np.zeros(len(X))

        for batchIndicies in np.array_split(indicies, NUM_BATCHES):
            X_batch = X[batchIndicies, :]
            y_batch = y[batchIndicies]

            losses[batchIndicies] = self._tensors.loss.eval(feed_dict={self._tensors.X_in: X_batch,
                                                                       self._tensors.y_in: y_batch})

        return np.average(losses)
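
For reference, a minimal derived class might look like the sketch below. It is an illustration of the contract described in the class docstring, not code from the project: the class name is made up, the single dense layer and loss are arbitrary choices, and it assumes RegressorTensors simply bundles the tensors that fit/predict rely on (X_in, y_in, dropoutKeepProb, trainingOp, loss, logits) as constructor arguments.

class ExampleLinearRegressor(TFRegressor):
    """A minimal single-layer regressor, used only to illustrate the derived-class contract."""

    def __init__(self, learningRate=0.001, batchSize=32, initializer=None,
                 dropoutRate=0.0, restoreFrom=None, outputLength=1):
        # The base constructor must be called with identically named parameters so that
        # sklearn's clone/get_params machinery keeps working
        super().__init__(learningRate, batchSize, initializer,
                         dropoutRate, restoreFrom, outputLength)

    def _buildGraph(self, numFeatures):
        # Placeholders fed by TFRegressor.fit
        X_in = tf.placeholder(tf.float32, shape=(None, numFeatures), name="X_in")
        y_in = tf.placeholder(tf.float32, shape=(None, self.outputLength), name="y_in")

        # Default of 1.0 so that predict/_evalLossBatched can run without feeding this placeholder
        dropoutKeepProb = tf.placeholder_with_default(1.0, shape=(), name="dropoutKeepProb")

        # A single fully connected layer with dropout on its input
        dropped = tf.nn.dropout(X_in, keep_prob=dropoutKeepProb)
        logits = tf.layers.dense(dropped, self.outputLength,
                                 kernel_initializer=self.initializer, name="logits")

        loss = tf.reduce_mean(tf.squared_difference(logits, y_in), name="loss")
        trainingOp = tf.train.AdamOptimizer(self.learningRate).minimize(loss)

        # Assumed constructor: RegressorTensors just groups the tensors used by the base class
        return RegressorTensors(X_in, y_in, dropoutKeepProb, trainingOp, loss, logits)

    def _buildHyperParamsDict(self):
        # Short abbreviations; _buildModelNameStr joins these into the model directory name
        return {"lr": str(self.learningRate),
                "bs": str(self.batchSize),
                "drop": str(self.dropoutRate)}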