def testTrainWithLocalVariable(self):
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    local_multiplier = variables_lib.local_variable(1.0)

    tf_predictions = logistic_classifier(tf_inputs) * local_multiplier
    losses.log_loss(tf_labels, tf_predictions)
    total_loss = losses.get_total_loss()

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

    train_op = training.create_train_op(total_loss, optimizer)

    loss = training.train(
        train_op,
        None,
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)],
        save_summaries_steps=None,
        save_checkpoint_secs=None)
    self.assertIsNotNone(loss)
    self.assertLess(loss, .015)
def testResumeTrainAchievesRoughlyTheSameLoss(self):
  number_of_steps = [300, 1, 5]
  logdir = os.path.join(self.get_temp_dir(), 'resume_train_same_loss')

  for i in range(len(number_of_steps)):
    with ops.Graph().as_default():
      random_seed.set_random_seed(i)
      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = logistic_classifier(tf_inputs)
      loss_ops.log_loss(tf_predictions, tf_labels)
      total_loss = loss_ops.get_total_loss()

      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      train_op = training.create_train_op(total_loss, optimizer)
      saver = saver_lib.Saver()

      loss = training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.StopAtStepHook(
                  num_steps=number_of_steps[i]),
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=50, saver=saver),
          ])
      self.assertIsNotNone(loss)
      self.assertLess(loss, .015)
def _train_model(self, checkpoint_dir, num_steps):
  """Trains a simple classification model.

  Note that the data has been configured such that after around 300 steps,
  the model has memorized the dataset (i.e. we can expect 100% accuracy).

  Args:
    checkpoint_dir: The directory where the checkpoint is written to.
    num_steps: The number of steps to train for.
  """
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    tf_predictions = logistic_classifier(tf_inputs)
    loss = loss_ops.log_loss(tf_predictions, tf_labels)

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

    train_op = training.create_train_op(loss, optimizer)

    loss = training.train(
        train_op,
        checkpoint_dir,
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps)])
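# A minimal usage sketch for `_train_model` (illustrative only; the checkpoint
# directory name below is hypothetical, not taken from this file):
#
#   checkpoint_dir = os.path.join(self.get_temp_dir(), 'train_then_eval')
#   self._train_model(checkpoint_dir, num_steps=300)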
def testCanAchieveZeroLoss(self):
  logdir = os.path.join(self.get_temp_dir(), 'can_achieve_zero_loss')

  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    tf_predictions = logistic_classifier(tf_inputs)
    loss_ops.log_loss(tf_predictions, tf_labels)
    total_loss = loss_ops.get_total_loss()

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

    train_op = training.create_train_op(total_loss, optimizer)

    loss = training.train(
        train_op,
        logdir,
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)])
    self.assertIsNotNone(loss)
    self.assertLess(loss, .015)
def gan_train(
    self,
    train_ops,
    logdir,
    get_hooks_fn=get_joint_train_hooks(),
    master='',
    is_chief=True,
    scaffold=None,
    hooks=None,
    chief_only_hooks=None,
    save_checkpoint_secs=600,
    save_summaries_steps=100,
    config=None):
  """A `gan_train` variant that uses joint train hooks by default.

  See the documented `gan_train` below for argument descriptions.
  """
  new_hooks = get_hooks_fn(train_ops)
  if hooks is not None:
    hooks = list(hooks) + list(new_hooks)
  else:
    hooks = new_hooks
  return training.train(
      train_ops.global_step_inc_op,
      logdir,
      master=master,
      is_chief=is_chief,
      scaffold=scaffold,
      hooks=hooks,
      chief_only_hooks=chief_only_hooks,
      save_checkpoint_secs=save_checkpoint_secs,
      save_summaries_steps=save_summaries_steps,
      config=config)
def testTrainWithAlteredGradients(self):
  # Use the same learning rate but different gradient multipliers to train
  # two models. The model trained with the effectively larger learning rate
  # (i.e., learning_rate * gradient_multiplier) should reach a smaller
  # training loss.
  multipliers = [1., 1000.]
  number_of_steps = 10
  learning_rate = 0.001

  # First, train the model with the effectively smaller learning rate.
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    train_op = self.create_train_op(
        learning_rate=learning_rate, gradient_multiplier=multipliers[0])

    loss0 = training.train(
        train_op,
        None,
        hooks=[
            basic_session_run_hooks.StopAtStepHook(num_steps=number_of_steps),
        ],
        save_checkpoint_secs=None,
        save_summaries_steps=None)
    self.assertIsNotNone(loss0)
    self.assertGreater(loss0, .5)

  # Second, train the model with the effectively larger learning rate.
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    train_op = self.create_train_op(
        learning_rate=learning_rate, gradient_multiplier=multipliers[1])

    loss1 = training.train(
        train_op,
        None,
        hooks=[
            basic_session_run_hooks.StopAtStepHook(num_steps=number_of_steps),
        ],
        save_checkpoint_secs=None,
        save_summaries_steps=None)
    self.assertIsNotNone(loss1)
    self.assertLess(loss1, .5)

  # The loss of the model trained with the larger learning rate should be
  # smaller.
  self.assertGreater(loss0, loss1)
def gan_train(
    train_ops,
    logdir,
    get_hooks_fn=get_sequential_train_hooks(),
    master='',
    is_chief=True,
    scaffold=None,
    hooks=None,
    chief_only_hooks=None,
    save_checkpoint_secs=600,
    save_summaries_steps=100,
    config=None):
  """A wrapper around `contrib.training.train` that uses GAN hooks.

  Args:
    train_ops: A GANTrainOps named tuple.
    logdir: The directory where the graph and checkpoints are saved.
    get_hooks_fn: A function that takes a GANTrainOps tuple and returns a
      list of hooks.
    master: The URL of the master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    scaffold: A tf.train.Scaffold instance.
    hooks: List of `tf.train.SessionRunHook` callbacks which are run inside
      the training loop.
    chief_only_hooks: List of `tf.train.SessionRunHook` instances which are
      run inside the training loop for the chief trainer only.
    save_checkpoint_secs: The frequency, in seconds, that a checkpoint is
      saved using a default checkpoint saver. If `save_checkpoint_secs` is
      set to `None`, then the default checkpoint saver isn't used.
    save_summaries_steps: The frequency, in number of global steps, that the
      summaries are written to disk using a default summary saver. If
      `save_summaries_steps` is set to `None`, then the default summary
      saver isn't used.
    config: An instance of `tf.ConfigProto`.

  Returns:
    Output of the call to `training.train`.
  """
  new_hooks = get_hooks_fn(train_ops)
  if hooks is not None:
    hooks = list(hooks) + list(new_hooks)
  else:
    hooks = new_hooks
  return training.train(
      train_ops.global_step_inc_op,
      logdir,
      master=master,
      is_chief=is_chief,
      scaffold=scaffold,
      hooks=hooks,
      chief_only_hooks=chief_only_hooks,
      save_checkpoint_secs=save_checkpoint_secs,
      save_summaries_steps=save_summaries_steps,
      config=config)
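# A minimal usage sketch for `gan_train` (illustrative only): it assumes a
# GANTrainOps named tuple `my_gan_train_ops` has been built elsewhere, and the
# logdir path and step count below are placeholders, not part of this module.
#
#   final_loss = gan_train(
#       my_gan_train_ops,
#       logdir='/tmp/gan_logs',
#       hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=1000)],
#       save_summaries_steps=None)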
def testTrainWithNoInitAssignCanAchieveZeroLoss(self):
  g = ops.Graph()
  with g.as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    tf_predictions = batchnorm_classifier(tf_inputs)
    loss_ops.log_loss(tf_predictions, tf_labels)
    total_loss = loss_ops.get_total_loss()

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

    train_op = training.create_train_op(total_loss, optimizer)

    loss = training.train(
        train_op,
        self._logdir,
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)])
    self.assertLess(loss, .1)
def testCanAchieveZeroLoss(self):
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    tf_predictions = logistic_classifier(tf_inputs)
    losses.log_loss(tf_labels, tf_predictions)
    total_loss = losses.get_total_loss()

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

    train_op = training.create_train_op(total_loss, optimizer)

    loss = training.train(
        train_op,
        None,
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=300)],
        save_summaries_steps=None,
        save_checkpoint_secs=None)
    self.assertIsNotNone(loss)
    self.assertLess(loss, .015)
def testTrainWithAlteredGradients(self):
  # Use the same learning rate but different gradient multipliers to train
  # two models. The model trained with the effectively larger learning rate
  # (i.e., learning_rate * gradient_multiplier) should reach a smaller
  # training loss.
  logdir1 = os.path.join(self.get_temp_dir(), 'tmp_logs6/')
  logdir2 = os.path.join(self.get_temp_dir(), 'tmp_logs7/')

  if gfile.Exists(logdir1):
    gfile.DeleteRecursively(logdir1)
  if gfile.Exists(logdir2):
    gfile.DeleteRecursively(logdir2)

  multipliers = [1., 1000.]
  number_of_steps = 10
  losses = []
  learning_rate = 0.001

  # First, train the model with the effectively smaller learning rate.
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    train_op = self.create_train_op(
        learning_rate=learning_rate, gradient_multiplier=multipliers[0])

    saver = saver_lib.Saver()

    loss = training.train(
        train_op,
        logdir1,
        hooks=[
            basic_session_run_hooks.StopAtStepHook(num_steps=number_of_steps),
            basic_session_run_hooks.CheckpointSaverHook(
                logdir1, save_steps=50, saver=saver),
        ])

    losses.append(loss)
    self.assertGreater(loss, .5)

  # Second, train the model with the effectively larger learning rate.
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    train_op = self.create_train_op(
        learning_rate=learning_rate, gradient_multiplier=multipliers[1])

    saver = saver_lib.Saver()

    loss = training.train(
        train_op,
        logdir2,
        hooks=[
            basic_session_run_hooks.StopAtStepHook(num_steps=number_of_steps),
            basic_session_run_hooks.CheckpointSaverHook(
                logdir2, save_steps=50, saver=saver),
        ])

    losses.append(loss)
    self.assertIsNotNone(loss)
    self.assertLess(loss, .5)

  # The loss of the model trained with the larger learning rate should be
  # smaller.
  self.assertGreater(losses[0], losses[1])
def testTrainAllVarsHasLowerLossThanTrainSubsetOfVars(self):
  logdir = os.path.join(self.get_temp_dir(), 'tmp_logs3/')
  if gfile.Exists(logdir):  # For running on jenkins.
    gfile.DeleteRecursively(logdir)

  # First, train only the weights of the model.
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    total_loss = self.ModelLoss()
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    weights = variables_lib.get_variables_by_name('weights')

    train_op = training.create_train_op(
        total_loss, optimizer, variables_to_train=weights)

    saver = saver_lib.Saver()
    loss = training.train(
        train_op,
        logdir,
        hooks=[
            basic_session_run_hooks.CheckpointSaverHook(
                logdir, save_steps=1, saver=saver),
            basic_session_run_hooks.StopAtStepHook(num_steps=200),
        ])
    self.assertGreater(loss, .015)
    self.assertLess(loss, .05)

  # Next, train only the biases of the model.
  with ops.Graph().as_default():
    random_seed.set_random_seed(1)
    total_loss = self.ModelLoss()
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    biases = variables_lib.get_variables_by_name('biases')

    train_op = training.create_train_op(
        total_loss, optimizer, variables_to_train=biases)

    saver = saver_lib.Saver()
    loss = training.train(
        train_op,
        logdir,
        hooks=[
            basic_session_run_hooks.CheckpointSaverHook(
                logdir, save_steps=1, saver=saver),
            basic_session_run_hooks.StopAtStepHook(num_steps=300),
        ])
    self.assertGreater(loss, .015)
    self.assertLess(loss, .05)

  # Finally, train both weights and biases to get a lower loss.
  with ops.Graph().as_default():
    random_seed.set_random_seed(2)
    total_loss = self.ModelLoss()
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

    train_op = training.create_train_op(total_loss, optimizer)
    saver = saver_lib.Saver()
    loss = training.train(
        train_op,
        logdir,
        hooks=[
            basic_session_run_hooks.CheckpointSaverHook(
                logdir, save_steps=1, saver=saver),
            basic_session_run_hooks.StopAtStepHook(num_steps=400),
        ])
    self.assertIsNotNone(loss)
    self.assertLess(loss, .015)
def testTrainWithInitFromCheckpoint(self):
  logdir1 = os.path.join(self.get_temp_dir(), 'tmp_logs1/')
  logdir2 = os.path.join(self.get_temp_dir(), 'tmp_logs2/')

  if gfile.Exists(logdir1):  # For running on jenkins.
    gfile.DeleteRecursively(logdir1)
  if gfile.Exists(logdir2):  # For running on jenkins.
    gfile.DeleteRecursively(logdir2)

  # First, train the model one step (make sure the error is high).
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    train_op = self.create_train_op()
    saver = saver_lib.Saver()
    loss = training.train(
        train_op,
        logdir1,
        hooks=[
            basic_session_run_hooks.CheckpointSaverHook(
                logdir1, save_steps=1, saver=saver),
            basic_session_run_hooks.StopAtStepHook(num_steps=1),
        ],
        save_checkpoint_secs=None)
    self.assertGreater(loss, .5)

  # Next, train the model to convergence.
  with ops.Graph().as_default():
    random_seed.set_random_seed(1)
    train_op = self.create_train_op()
    saver = saver_lib.Saver()
    loss = training.train(
        train_op,
        logdir1,
        hooks=[
            basic_session_run_hooks.CheckpointSaverHook(
                logdir1, save_steps=1, saver=saver),
            basic_session_run_hooks.StopAtStepHook(num_steps=300),
        ],
        save_checkpoint_secs=None)
    self.assertIsNotNone(loss)
    self.assertLess(loss, .02)

  # Finally, advance the model a single step and validate that the loss is
  # still low.
  with ops.Graph().as_default():
    random_seed.set_random_seed(2)
    train_op = self.create_train_op()

    model_variables = variables_lib2.global_variables()
    model_path = os.path.join(logdir1, 'model.ckpt-300')

    assign_fn = variables_lib.assign_from_checkpoint_fn(model_path,
                                                        model_variables)

    def init_fn(_, session):
      assign_fn(session)

    loss = training.train(
        train_op,
        logdir2,
        scaffold=monitored_session.Scaffold(init_fn=init_fn),
        hooks=[basic_session_run_hooks.StopAtStepHook(num_steps=1)])

    self.assertIsNotNone(loss)
    self.assertLess(loss, .02)
def main(_):
  utils.create_folder(folder_list=[FLAGS.dataset_dir, FLAGS.train_log_dir])

  # Force all input processing onto CPU in order to reserve the GPU for
  # the forward inference and back-propagation.
  with tf.name_scope('inputs'):
    with tf.device('/cpu:0'):
      # Renamed from `input` to avoid shadowing the Python builtin.
      inputs = ginput.Input()
      input_a, input_b = inputs.get_batch(FLAGS.batch_size, FLAGS.img_size,
                                          FLAGS.channels)
      labels = tf.cast(
          tf.random_uniform(
              [FLAGS.batch_size, 1], minval=0, maxval=2, dtype=tf.int32),
          dtype=tf.float32)

  # Define the model.
  model = gmodel.Model(
      model_type=FLAGS.model_type, is_train=True, grid_size=FLAGS.grid_size)
  model.build(img_a=input_a, img_b=input_b, labels=labels)

  # Define the loss.
  loss_fn = gloss.Loss(
      model=model, weight_decay_coeff=FLAGS.weight_decay_coeff)

  # Get the global step and the respective increment operation.
  global_step_inc = utils.create_global_step()

  # Create the training operation.
  train_op = gtrain_op.TrainOp(
      model=model,
      loss_fn=loss_fn,
      learning_rate=FLAGS.learning_rate,
      decay_steps=FLAGS.decay_steps,
      beta=FLAGS.beta,
      staircase=FLAGS.staircase)
  op = train_op.create_train_op()

  status_message = tf.string_join(
      [
          'Starting train step: ',
          tf.as_string(tf.train.get_or_create_global_step())
      ],
      name='status_message')

  hooks = [
      tf.train.LoggingTensorHook([status_message], every_n_iter=10),
      tf.train.StopAtStepHook(num_steps=FLAGS.max_number_of_steps),
      utils.RunTrainOpsHook(train_ops=op)
  ]

  training.train(
      train_op=global_step_inc,
      logdir=FLAGS.train_log_dir,
      master='',
      is_chief=True,
      scaffold=None,
      hooks=hooks,
      chief_only_hooks=None,
      save_checkpoint_secs=FLAGS.save_checkpoint_secs,
      save_summaries_steps=FLAGS.save_summaries_steps,
      config=None,
      max_wait_secs=7200)