def test_RestoreIncorrectSummaries(self, request):
    """
    Tests that an exception is thrown if a summary name is provided which couldn't be found in
    the graph. This could happen when one of the summary names provided to the constructor
    didn't exist in the previous run.
    """
    tf.reset_default_graph()

    # Setup our helpers, use a single summary
    manager = FileManager(request.node.name)
    logHelper = TensorboardLogHelper(manager.getModelDir(), tf.get_default_graph(), ["test1"],
                                     False)

    # Write once
    with tf.Session() as sess:
        logHelper.writeSummary(sess, [5.0])
        logHelper.close()

    # Now restore, but using a summary name ("test2") that was never written. Only the
    # constructor call is of interest, so the instance is deliberately not kept.
    with pytest.raises(KeyError):
        TensorboardLogHelper(manager.getModelDir(), tf.get_default_graph(), ["test2"], True)
def test_FileCreation(self, request):
    """ Tests if the correct files are generated when a model is saved. """
    # Lets add some basic operators to the graph. The variables are never read back, they only
    # need to exist in the graph when the checkpoint is written.
    tf.reset_default_graph()
    A = tf.Variable(10, dtype=tf.float32)
    B = tf.Variable(15, dtype=tf.float32)
    init = tf.global_variables_initializer()

    # Setup our helpers
    fileManager = FileManager(request.node.name, None)
    restoreHelper = CheckpointAndRestoreHelper(fileManager.getModelDirAndPrefix(), False,
                                               tf.get_default_graph())

    # Now we'll save the model (we don't actually need to train anything)
    with tf.Session() as sess:
        init.run()
        restoreHelper.saveCheckpoint(sess, 0)

    # Take the directory from the manager instead of re-deriving the timestamp with a second
    # clock read: if the minute ticked over between the two reads the test would look in the
    # wrong directory and fail spuriously.
    modelDir = fileManager.getModelDir()

    # Check all files are created
    expectedFiles = [
        "checkpoint",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.epoch",
        "model.ckpt.index",
        "model.ckpt.meta",
    ]
    for filename in expectedFiles:
        assert os.path.isfile(str(pathlib.Path(modelDir) / filename)), filename
def test_WriteStaticAndDynamicSummaries(self, request):
    """
    Tests that both static and dynamic summaries are written: one summary defined directly in
    the graph and one registered by name through the TensorboardLogHelper.
    """
    tf.reset_default_graph()

    # Build a simple graph with one statically defined summary
    A = tf.Variable(10, dtype=tf.float32, name="A")
    tf.summary.scalar("static", A)
    init = tf.global_variables_initializer()

    # Setup our helpers, use a single dynamic summary
    manager = FileManager(request.node.name)
    logHelper = TensorboardLogHelper(manager.getModelDir(), tf.get_default_graph(), ["dynamic"],
                                     False)

    with tf.Session() as sess:
        init.run()
        logHelper.writeSummary(sess, [5.0])
        logHelper.close()

    # Manually inspect the tensorboard log. The directory is read from the manager rather than
    # rebuilt from a fresh timestamp, which would point at the wrong directory whenever the
    # minute rolls over while the test is running.
    ea = event_accumulator.EventAccumulator(manager.getModelDir())
    ea.Reload()
    assert ea.Scalars("static")[0].value == 10
    assert ea.Scalars("TensorboardLogHelper/dynamic_1")[0].value == 5
def test_RestoreMetaFail(self, request):
    """
    Checks that asking for a restore fails with an OSError when there is no meta graph on disk
    to restore from.
    """
    manager = FileManager(request.node.name, None)

    # Nothing has been saved for this model, so constructing the helper in restore mode
    # (second argument True) is expected to raise.
    with pytest.raises(OSError):
        _ = CheckpointAndRestoreHelper(
            manager.getModelDirAndPrefix(),
            True,
            tf.get_default_graph(),
        )
def test_WithRestore(self):
    """
    Checks the paths reported by FileManager when it is given the name of a run to restore
    from.
    """
    restoreStamp = "20171225-1200"
    fileManager = FileManager("TestRegressor", restoreStamp)

    actualDir = fileManager.getModelDir()
    assert actualDir == os.getcwd() + "/models/TestRegressor/" + restoreStamp

    actualPrefix = fileManager.getModelDirAndPrefix()
    assert actualPrefix == os.getcwd() + "/models/TestRegressor/" + restoreStamp + "/model"
def test_WithoutRestore(self):
    """
    Tests if the correct model paths are generated if no model to restore from is provided. In
    that case the directory is expected to be named after the current UTC time, to the minute.
    """
    TIMESTAMP_FORMAT = "%Y%m%d-%H%M"

    # Sample the clock on both sides of the construction. Whichever minute the manager saw, it
    # must be one of these two, so the test no longer fails when the minute rolls over between
    # creating the manager and computing the expected timestamp.
    stampBefore = datetime.utcnow().strftime(TIMESTAMP_FORMAT)
    manager = FileManager("TestRegressor")
    stampAfter = datetime.utcnow().strftime(TIMESTAMP_FORMAT)

    expectedDirs = [
        os.getcwd() + "/models/TestRegressor/" + stamp for stamp in (stampBefore, stampAfter)
    ]
    expectedPrefixes = [modelDir + "/model" for modelDir in expectedDirs]

    assert manager.getModelDir() in expectedDirs
    assert manager.getModelDirAndPrefix() in expectedPrefixes
def test_IncorrectSummaries(self, request):
    """
    Checks that writing a different number of summary values than there are summary names
    raises a ValueError.
    """
    tf.reset_default_graph()

    fileManager = FileManager(request.node.name)
    summaryNames = ["summ"]
    helper = TensorboardLogHelper(fileManager.getModelDir(), tf.get_default_graph(),
                                  summaryNames, False)

    # One summary name was registered above, so two values is one too many
    tooManyValues = [0, 1]
    with tf.Session() as session:
        with pytest.raises(ValueError):
            helper.writeSummary(session, tooManyValues)
        helper.close()
def test_RestoreFreshRun(self, request):
    """
    Checks that a model can be restored by reloading the meta graph into a fresh graph. This
    would be the most common use case, where the process has been interrupted and the graph has
    to be loaded from files.
    """
    # Start from a clean graph holding two named variables
    tf.reset_default_graph()
    varA = tf.Variable(10, dtype=tf.float32, name="A")
    varB = tf.Variable(15, dtype=tf.float32, name="B")
    initOp = tf.global_variables_initializer()

    # Ops which overwrite the variables so that a restore can be told apart from a no-op
    overwriteA = varA.assign(5)
    overwriteB = varB.assign(5)

    # Helpers under test
    manager = FileManager(request.node.name, None)
    checkpointer = CheckpointAndRestoreHelper(manager.getModelDirAndPrefix(), False,
                                              tf.get_default_graph())

    # Save the initial values (no training is needed), then clobber them
    with tf.Session() as session:
        initOp.run()
        checkpointer.saveCheckpoint(session, 0)

        for overwriteOp in (overwriteA, overwriteB):
            overwriteOp.eval()
        for variable in (varA, varB):
            assert variable.eval() == 5

    # Throw the graph away and rebuild it from disk by constructing the helper in restore mode
    # (second argument True), then look the variables up again by name.
    tf.reset_default_graph()
    checkpointer = CheckpointAndRestoreHelper(manager.getModelDirAndPrefix(), True,
                                              tf.get_default_graph())
    restoredGraph = tf.get_default_graph()
    restoredA = restoredGraph.get_tensor_by_name("A:0")
    restoredB = restoredGraph.get_tensor_by_name("B:0")

    with tf.Session() as session:
        checkpointer.restoreFromCheckpoint(session)
        assert restoredA.eval() == 10
        assert restoredB.eval() == 15
def test_WriteLogs(self, request):
    """ Tests that log files are created in the model directory. """
    tf.reset_default_graph()

    manager = FileManager(request.node.name)
    logHelper = TensorboardLogHelper(manager.getModelDir(), tf.get_default_graph(), ["summ"],
                                     False)

    with tf.Session() as sess:
        logHelper.writeSummary(sess, [0])
        logHelper.close()

    # Look in the directory reported by the manager. Rebuilding the path from a freshly sampled
    # timestamp would miss the files whenever the minute rolls over during the test.
    eventFilePattern = str(pathlib.Path(manager.getModelDir()) / "events.out.tfevents.") + "*"
    assert glob.glob(eventFilePattern)
def test_SuccessfulRestore(self, request):
    """
    Tests that when restoring from a previous session new events are correctly appended.
    """
    tf.reset_default_graph()

    # Setup our helpers, use a single summary
    manager = FileManager(request.node.name)
    logHelper = TensorboardLogHelper(manager.getModelDir(), tf.get_default_graph(), ["test1"],
                                     False)

    # Write once
    with tf.Session() as sess:
        logHelper.writeSummary(sess, [5.0])
        logHelper.close()

    # Now restore. Wait one second first, otherwise the first events file would be overwritten.
    time.sleep(1)
    logHelper = TensorboardLogHelper(manager.getModelDir(), tf.get_default_graph(), ["test1"],
                                     True)

    # Write again
    with tf.Session() as sess:
        logHelper.setIteration(1)
        logHelper.writeSummary(sess, [6.0])
        logHelper.close()

    # Manually inspect the tensorboard log. The directory comes from the manager rather than a
    # freshly sampled timestamp, so a minute rollover during the test (which sleeps for a
    # second) cannot send the reader to the wrong directory.
    ea = event_accumulator.EventAccumulator(manager.getModelDir())
    ea.Reload()
    assert ea.Scalars("TensorboardLogHelper/test1_1")[0].value == 5
    assert ea.Scalars("TensorboardLogHelper/test1_1")[1].value == 6
def test_RestoreDuringRun(self, request):
    """
    Checks that a model can be restored part way through a run while continuing to use the same
    graph, i.e. without loading the meta graph.
    """
    # Start from a clean graph holding two variables
    tf.reset_default_graph()
    varA = tf.Variable(10, dtype=tf.float32)
    varB = tf.Variable(15, dtype=tf.float32)
    initOp = tf.global_variables_initializer()

    # Ops which overwrite the variables so that a restore can be told apart from a no-op
    overwriteA = varA.assign(5)
    overwriteB = varB.assign(5)

    # Helpers under test
    manager = FileManager(request.node.name, None)
    checkpointer = CheckpointAndRestoreHelper(manager.getModelDirAndPrefix(), False,
                                              tf.get_default_graph())

    with tf.Session() as session:
        # Save the initial values (no training is needed)
        initOp.run()
        checkpointer.saveCheckpoint(session, 0)

        # Clobber both variables and confirm the change took effect
        for overwriteOp in (overwriteA, overwriteB):
            overwriteOp.eval()
        for variable in (varA, varB):
            assert variable.eval() == 5

        # Restoring inside the same session must bring the saved values back
        checkpointer.restoreFromCheckpoint(session)
        assert varA.eval() == 10
        assert varB.eval() == 15
class TFRegressor(SKTFWrapper):
    """
    Provides functionality that is common to TF regression models, mainly the training loop.

    Derived classes must:
        - Provide a constructor which calls the constructor of this class
        - Implement a _buildGraph method which returns a RegressorTensors object (fit stores it
          in the _tensors member)
        - Implement a _buildHyperParamsDict method which returns a dict of strings describing
          the model's hyperparameters (_buildModelNameStr turns it into the model name)
        - Implement a _restoreGraph method if they intend to support restoration
    """

    def __init__(self, learningRate, batchSize, initializer, dropoutRate, restoreFrom,
                 outputLength):
        """
        Stores the hyperparameters and creates an empty graph; nothing is built until fit.

        restoreFrom is passed on to FileManager in fit; None means a fresh run, anything else
        switches fit into restore mode.
        """
        # Scikit-learn's api demands that parameters in the constructor are assigned to members
        # with exactly the same name otherwise its clone method sets everything to None
        # (see BaseEstimator::get_params)
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.initializer = initializer
        self.dropoutRate = dropoutRate
        self.restoreFrom = restoreFrom
        self.outputLength = outputLength

        self._session = None
        self._graph = tf.Graph()
        self._fileManager = None
        self._allowRestore = restoreFrom is not None
        self._tensors = None
        self._init = None
        self._saver = None

    def _buildGraph(self, numFeatures):
        """
        Build the graph and return a RegressorTensors object which contains the important
        tensors for the graph.
        ** Derived classes should implement this **
        """
        raise NotImplementedError()

    def _restoreGraph(self, graph):
        """
        Use graph.get_tensor_by_name("<name>:0") to collect the important tensors and return a
        RegressorTensors object.
        ** Derived classes should implement this if they intend to support restoration **
        """
        raise NotImplementedError()

    def _buildHyperParamsDict(self) -> Dict[str, str]:
        """
        Return a dict of strings, where the keys are around 1 to 4 character abbreviations of
        hyperparameter names, and the values are the corresponding hyperparameter values.
        ** Derived classes should implement this **
        """
        raise NotImplementedError()

    def _buildModelNameStr(self) -> str:
        """
        Return a url/filename safe string describing the model type and its hyperparameters.

        The result is the class name followed by "-<key>-<value>" for every entry of
        _buildHyperParamsDict, in the dict's iteration order.
        """
        paramsDict = self._buildHyperParamsDict()
        suffix = "".join("-" + key + "-" + value for key, value in paramsDict.items())
        return self.__class__.__name__ + suffix

    def _onEpochComplete(self, numEpoch) -> None:
        """
        If you need to do any processing at the end of each epoch override this method.
        numEpoch is the number of epochs completed starting from 0.
        """
        pass

    def fit(self, X, y, X_valid, y_valid, numEpochs=1):
        """
        Fits the model on the training set.

        Either builds a new graph or, when restoreFrom was given, restores the previous one,
        then trains on shuffled mini-batches. After every epoch the losses are written to
        tensorboard, a checkpoint is saved and the early stopping helper is consulted.

        X, y:               training inputs and targets, indexable with an array of row indices
        X_valid, y_valid:   validation inputs and targets, used for the per-epoch loss
        numEpochs:          epoch number at which training stops

        Raises ValueError if X is empty and RuntimeError if the graph could not be restored.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit on an empty training set")

        # NOTE(review): _closeSession is not defined in this class, presumably it is inherited
        # from SKTFWrapper - confirm
        self._closeSession()

        # This must be initialised during fit for sklearn's grid search to call it at the
        # correct time
        self._fileManager = FileManager(self._buildModelNameStr(), self.restoreFrom)
        restoring = self.restoreFrom is not None

        with self._graph.as_default():
            if not restoring:
                self._tensors = self._buildGraph(X.shape[1])
                self._init = tf.global_variables_initializer()

            stoppingHelper = EarlyStoppingHelper()
            restoreHelper = CheckpointAndRestoreHelper(self._fileManager.getModelDirAndPrefix(),
                                                       restoring, self._graph)
            tensorboardHelper = TensorboardLogHelper(self._fileManager.getModelDir(),
                                                     self._graph,
                                                     ["LossTrain", "LossVal", "BatchTimeAvg"],
                                                     restoring)
            self._session = tf.Session(graph=self._graph)
            trainingValidator = TrainingValidator(self._graph, self._session)

            with self._session.as_default() as sess:
                # The finally clause makes sure the tensorboard writer is closed even if
                # restoring or training raises
                try:
                    if not restoring:
                        startEpoch = 0
                        self._init.run()
                    else:
                        startEpoch = restoreHelper.restoreFromCheckpoint(sess)
                        tensorboardHelper.setIteration(startEpoch)
                        # This call to restore the graph doesn't need to be done inside the
                        # session, may be better to move it outside the session
                        try:
                            self._tensors = self._restoreGraph(self._graph)
                        except KeyError as err:
                            print([n.name for n in self._graph.as_graph_def().node])
                            print("\n" + str(err))
                            print("\nThe available tensors/ops have been printed above this error")
                            # Chain the KeyError so the missing tensor name stays in the
                            # traceback
                            raise RuntimeError("Model failed to restore") from err

                    progressCalc = ProgressCalculator(numEpochs - startEpoch)
                    progressCalc.start()

                    for epoch in range(startEpoch, numEpochs):
                        randomIndicies = np.random.permutation(len(X))
                        # np.array_split rejects 0 sections, which is what the plain division
                        # gives when there are fewer samples than batchSize
                        NUM_BATCHES = max(1, len(X) // self.batchSize)
                        batchTimes = []
                        for batchNumber, batchIndicies in enumerate(
                                np.array_split(randomIndicies, NUM_BATCHES)):
                            batchStart = time.time()
                            X_batch, y_batch = X[batchIndicies], y[batchIndicies]
                            feed_dict = {self._tensors.X_in: X_batch,
                                         self._tensors.y_in: y_batch,
                                         self._tensors.dropoutKeepProb: 1 - self.dropoutRate}
                            sess.run(self._tensors.trainingOp, feed_dict=feed_dict)
                            batchTimes.append(time.time() - batchStart)
                            print("Batch:", batchNumber, "/", NUM_BATCHES,
                                  "{0:.4f}".format(batchTimes[-1]) + "s", end="\r")

                        # Calculate and log the losses for this epoch. The training loss is
                        # measured on the last batch only, reusing its feed_dict.
                        lossTrain = self._tensors.loss.eval(feed_dict=feed_dict)
                        lossVal = self._evalLossBatched(X_valid, y_valid)
                        tensorboardHelper.writeSummary(
                            sess, [lossTrain, lossVal, np.average(batchTimes)])

                        progressCalc.updateInterval(1)
                        print("\033[K" + "Epoch: {0}\tValidation loss: {1}\tTime Remaining: {2}".format(
                            epoch, lossVal, progressCalc.getTimeStampRemaining()))

                        restoreHelper.saveCheckpoint(sess, epoch)

                        if epoch < 2:
                            trainingValidator.validate(lossVal)

                        self._onEpochComplete(epoch)

                        # This must be the last thing done in an epoch
                        if stoppingHelper.shouldStop(lossVal):
                            print("Early stopping at epoch: ", epoch)
                            break

                    stoppingHelper.restoreBestModelParams()
                finally:
                    tensorboardHelper.close()

                print("Time taken:", progressCalc.timeTaken())

    def predict(self, X):
        """
        Returns the model's predictions for the provided data as an array with one row per row
        of X and outputLength columns. An empty X gives an empty (0, outputLength) array.

        Raises NotFittedError if the model has no session, i.e. fit has not been run.
        """
        if not self._session:
            raise NotFittedError(
                "This {0} instance is not fitted yet".format(self.__class__.__name__))

        predictions = np.zeros((X.shape[0], self.outputLength))
        if len(X) == 0:
            # Nothing to predict, and the batch maths below would divide by zero
            return predictions

        # Never use a batch larger than the data, otherwise there would be zero batches
        BATCH_SIZE = min(self.batchSize, len(X))
        NUM_BATCHES = len(X) // BATCH_SIZE
        indicies = np.arange(len(X))

        with self._session.as_default():
            for batchIndicies in np.array_split(indicies, NUM_BATCHES):
                X_batch = X[batchIndicies, :]
                # NOTE(review): dropoutKeepProb is not fed here although fit feeds it,
                # presumably the tensor has a usable default - confirm
                predictions[batchIndicies, :] = self._tensors.logits.eval(
                    feed_dict={self._tensors.X_in: X_batch})

        return predictions

    def _evalLossBatched(self, X, y):
        """
        Do validation in batches in case the dataset would need 10's of GB.

        Every sample is assigned the loss of the batch it belongs to and the average over all
        samples is returned. Must be called while a default session is active, since it uses
        Tensor.eval without an explicit session.

        Raises ValueError if X is empty.
        """
        if len(X) == 0:
            raise ValueError("Cannot evaluate the loss on an empty dataset")

        # At least one batch, np.array_split rejects 0 sections (len(X) < batchSize)
        NUM_BATCHES = max(1, len(X) // self.batchSize)
        indicies = np.arange(len(X))
        losses = np.zeros(len(X))

        for batchIndicies in np.array_split(indicies, NUM_BATCHES):
            X_batch = X[batchIndicies, :]
            y_batch = y[batchIndicies]
            # NOTE(review): dropoutKeepProb is not fed here although fit feeds it, presumably
            # the tensor has a usable default - confirm
            losses[batchIndicies] = self._tensors.loss.eval(
                feed_dict={self._tensors.X_in: X_batch, self._tensors.y_in: y_batch})

        return np.average(losses)
def fit(self, X, y, X_valid, y_valid, numEpochs=1):
    """
    Fits the model on the training set.

    Either builds a new graph or, when restoreFrom was given, restores the previous one, then
    trains on shuffled mini-batches. After every epoch the losses are written to tensorboard, a
    checkpoint is saved and the early stopping helper is consulted.

    X, y:               training inputs and targets, indexable with an array of row indices
    X_valid, y_valid:   validation inputs and targets, used for the per-epoch loss
    numEpochs:          epoch number at which training stops

    Raises ValueError if X is empty and RuntimeError if the graph could not be restored.
    """
    if len(X) == 0:
        raise ValueError("Cannot fit on an empty training set")

    self._closeSession()

    # This must be initialised during fit for sklearn's grid search to call it at the correct
    # time
    self._fileManager = FileManager(self._buildModelNameStr(), self.restoreFrom)
    restoring = self.restoreFrom is not None

    with self._graph.as_default():
        if not restoring:
            self._tensors = self._buildGraph(X.shape[1])
            self._init = tf.global_variables_initializer()

        stoppingHelper = EarlyStoppingHelper()
        restoreHelper = CheckpointAndRestoreHelper(self._fileManager.getModelDirAndPrefix(),
                                                   restoring, self._graph)
        tensorboardHelper = TensorboardLogHelper(self._fileManager.getModelDir(), self._graph,
                                                 ["LossTrain", "LossVal", "BatchTimeAvg"],
                                                 restoring)
        self._session = tf.Session(graph=self._graph)
        trainingValidator = TrainingValidator(self._graph, self._session)

        with self._session.as_default() as sess:
            # The finally clause makes sure the tensorboard writer is closed even if restoring
            # or training raises
            try:
                if not restoring:
                    startEpoch = 0
                    self._init.run()
                else:
                    startEpoch = restoreHelper.restoreFromCheckpoint(sess)
                    tensorboardHelper.setIteration(startEpoch)
                    # This call to restore the graph doesn't need to be done inside the
                    # session, may be better to move it outside the session
                    try:
                        self._tensors = self._restoreGraph(self._graph)
                    except KeyError as err:
                        print([n.name for n in self._graph.as_graph_def().node])
                        print("\n" + str(err))
                        print("\nThe available tensors/ops have been printed above this error")
                        # Chain the KeyError so the missing tensor name stays in the traceback
                        raise RuntimeError("Model failed to restore") from err

                progressCalc = ProgressCalculator(numEpochs - startEpoch)
                progressCalc.start()

                for epoch in range(startEpoch, numEpochs):
                    randomIndicies = np.random.permutation(len(X))
                    # np.array_split rejects 0 sections, which is what the plain division
                    # gives when there are fewer samples than batchSize
                    NUM_BATCHES = max(1, len(X) // self.batchSize)
                    batchTimes = []
                    for batchNumber, batchIndicies in enumerate(
                            np.array_split(randomIndicies, NUM_BATCHES)):
                        batchStart = time.time()
                        X_batch, y_batch = X[batchIndicies], y[batchIndicies]
                        feed_dict = {self._tensors.X_in: X_batch,
                                     self._tensors.y_in: y_batch,
                                     self._tensors.dropoutKeepProb: 1 - self.dropoutRate}
                        sess.run(self._tensors.trainingOp, feed_dict=feed_dict)
                        batchTimes.append(time.time() - batchStart)
                        print("Batch:", batchNumber, "/", NUM_BATCHES,
                              "{0:.4f}".format(batchTimes[-1]) + "s", end="\r")

                    # Calculate and log the losses for this epoch. The training loss is
                    # measured on the last batch only, reusing its feed_dict.
                    lossTrain = self._tensors.loss.eval(feed_dict=feed_dict)
                    lossVal = self._evalLossBatched(X_valid, y_valid)
                    tensorboardHelper.writeSummary(
                        sess, [lossTrain, lossVal, np.average(batchTimes)])

                    progressCalc.updateInterval(1)
                    print("\033[K" + "Epoch: {0}\tValidation loss: {1}\tTime Remaining: {2}".format(
                        epoch, lossVal, progressCalc.getTimeStampRemaining()))

                    restoreHelper.saveCheckpoint(sess, epoch)

                    if epoch < 2:
                        trainingValidator.validate(lossVal)

                    self._onEpochComplete(epoch)

                    # This must be the last thing done in an epoch
                    if stoppingHelper.shouldStop(lossVal):
                        print("Early stopping at epoch: ", epoch)
                        break

                stoppingHelper.restoreBestModelParams()
            finally:
                tensorboardHelper.close()

            print("Time taken:", progressCalc.timeTaken())