def testUpdateClipCoeff(self):
  with tf.Graph().as_default(), self.test_session() as sess:
    grads_and_vars = [(tf.constant([[1., 2.], [3., 4.]]), None),
                      (tf.constant([[2., 3.], [4., 5.]]), None)]
    pgrads_and_vars = [(tf.constant([[3., 4.], [5., 6.]]), None),
                       (tf.constant([[7., 8.], [9., 10.]]), None)]
    lrate = 0.1

    # Note: without rescaling, the squared Fisher norm of the update
    # is 1.74.

    # If the update already satisfies the norm constraint, there should
    # be no rescaling.
    opt = optimizer.KfacOptimizer(lrate, 0.2, dummy_layer_collection(), 0.3,
                                  norm_constraint=10., name='KFAC_1')
    coeff = opt._update_clip_coeff(grads_and_vars, pgrads_and_vars)
    self.assertAlmostEqual(1., sess.run(coeff), places=5)

    # If the update violates the constraint, it should be rescaled to
    # be on the constraint boundary.
    opt = optimizer.KfacOptimizer(lrate, 0.2, dummy_layer_collection(), 0.3,
                                  norm_constraint=0.5, name='KFAC_2')
    coeff = opt._update_clip_coeff(grads_and_vars, pgrads_and_vars)
    sq_norm_pgrad = opt._squared_fisher_norm(grads_and_vars, pgrads_and_vars)
    sq_norm_update = lrate**2 * coeff**2 * sq_norm_pgrad
    self.assertAlmostEqual(0.5, sess.run(sq_norm_update), places=5)
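# The clipping rule the assertions above encode can be written out directly.
# This is an illustrative NumPy sketch, not the library's implementation;
# the helper name `reference_clip_coeff` is ours. The update
# lrate * coeff * pgrad is scaled so that its squared Fisher norm never
# exceeds norm_constraint.
def reference_clip_coeff(lrate, sq_fisher_norm, norm_constraint):
  """Returns min(1, sqrt(norm_constraint / (lrate**2 * sq_fisher_norm)))."""
  return min(1., np.sqrt(norm_constraint / (lrate**2 * sq_fisher_norm)))

# With the constants above: lrate**2 * 174 = 1.74, so norm_constraint=10.
# leaves the update untouched (coeff == 1), while norm_constraint=0.5 yields
# coeff = sqrt(0.5 / 1.74) ~= 0.536, which places the rescaled update exactly
# on the constraint boundary: 0.1**2 * 0.536**2 * 174 ~= 0.5.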
def testOptimizerInit(self):
  with tf.Graph().as_default():
    layer_collection = lc.LayerCollection()

    inputs = tf.ones((2, 1)) * 2
    weights_val = np.ones((1, 1), dtype=np.float32) * 3.
    weights = tf.get_variable('w', initializer=tf.constant(weights_val))
    bias = tf.get_variable(
        'b', initializer=tf.zeros_initializer(), shape=(1, 1))
    output = tf.matmul(inputs, weights) + bias

    layer_collection.register_fully_connected((weights, bias), inputs, output)

    logits = tf.tanh(output)
    targets = tf.constant([[0.], [1.]])
    output = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=targets))

    layer_collection.register_categorical_predictive_distribution(logits)

    optimizer.KfacOptimizer(
        0.1,
        0.2,
        layer_collection,
        0.3,
        momentum=0.5,
        momentum_type='regular')
def testOptimizerInitInvalidMomentumRegistration(self):
  with self.assertRaises(ValueError):
    optimizer.KfacOptimizer(
        0.1, 0.2, lc.LayerCollection(), 0.3, momentum_type='foo')
def testSquaredFisherNorm(self):
  with tf.Graph().as_default(), self.test_session() as sess:
    grads_and_vars = [(tf.constant([[1., 2.], [3., 4.]]), None),
                      (tf.constant([[2., 3.], [4., 5.]]), None)]
    pgrads_and_vars = [(tf.constant([[3., 4.], [5., 6.]]), None),
                       (tf.constant([[7., 8.], [9., 10.]]), None)]
    opt = optimizer.KfacOptimizer(0.1, 0.2, dummy_layer_collection(), 0.3)
    sq_norm = opt._squared_fisher_norm(grads_and_vars, pgrads_and_vars)
    self.assertAlmostEqual(174., sess.run(sq_norm), places=5)
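# For reference, the 174. checked above is the sum of elementwise products
# between the gradients and the preconditioned gradients. A minimal NumPy
# sketch (the helper name is ours, not part of the library):
def reference_squared_fisher_norm(grads, pgrads):
  """Sum over parameter blocks of the inner product <g, pg>."""
  return sum(np.sum(g * pg) for g, pg in zip(grads, pgrads))

# Block 1: 1*3 + 2*4 + 3*5 + 4*6 = 50
# Block 2: 2*7 + 3*8 + 4*9 + 5*10 = 124
# Total: 174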
def testApplyGradients(self):
  with tf.Graph().as_default(), self.test_session() as sess:
    layer_collection = lc.LayerCollection()

    inputs = tf.ones((2, 1)) * 2
    weights_val = np.ones((1, 1), dtype=np.float32) * 3.
    weights = tf.get_variable('w', initializer=tf.constant(weights_val))
    bias = tf.get_variable(
        'b', initializer=tf.zeros_initializer(), shape=(1, 1))
    output = tf.matmul(inputs, weights) + bias

    layer_collection.register_fully_connected((weights, bias), inputs, output)

    logits = tf.tanh(output)
    targets = tf.constant([[0.], [1.]])
    output = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=targets))

    layer_collection.register_categorical_predictive_distribution(logits)

    opt = optimizer.KfacOptimizer(
        0.1,
        0.2,
        layer_collection,
        0.3,
        momentum=0.5,
        momentum_type='regular')
    (cov_update_thunks,
     inv_update_thunks) = opt.make_vars_and_create_op_thunks()
    cov_update_ops = tuple(thunk() for thunk in cov_update_thunks)
    inv_update_ops = tuple(thunk() for thunk in inv_update_thunks)

    grads_and_vars = opt.compute_gradients(output, [weights, bias])
    all_vars = [grad_and_var[1] for grad_and_var in grads_and_vars]

    op = opt.apply_gradients(grads_and_vars)

    sess.run(tf.global_variables_initializer())
    old_vars = sess.run(all_vars)
    sess.run(cov_update_ops)
    sess.run(inv_update_ops)
    sess.run(op)
    new_vars = sess.run(all_vars)

    for old_var, new_var in zip(old_vars, new_vars):
      self.assertNotEqual(old_var, new_var)
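# The run order above is load-bearing: covariance statistics must be
# estimated before the inverses that depend on them, and both before the
# preconditioned parameter update. A minimal sketch of one KFAC iteration
# (the helper name is ours; `sess` and the ops are built as in
# testApplyGradients):
def run_kfac_step(sess, cov_update_ops, inv_update_ops, train_op):
  sess.run(cov_update_ops)  # refresh Fisher-factor statistics
  sess.run(inv_update_ops)  # recompute the damped factor inverses
  sess.run(train_op)        # apply the preconditioned gradient step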
def testUpdateVelocities(self):
  with tf.Graph().as_default(), self.test_session() as sess:
    layers = lc.LayerCollection()
    layers.register_categorical_predictive_distribution(tf.constant([1.0]))
    opt = optimizer.KfacOptimizer(
        0.1, 0.2, layers, 0.3, momentum=0.5, momentum_type='regular')
    x = tf.get_variable('x', initializer=tf.ones((2, 2)))
    y = tf.get_variable('y', initializer=tf.ones((2, 2)) * 2)
    vec1 = tf.ones((2, 2)) * 3
    vec2 = tf.ones((2, 2)) * 4

    model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    update_op = opt._update_velocities([(vec1, x), (vec2, y)], 0.5)
    opt_vars = [
        v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if v not in model_vars
    ]

    sess.run(tf.global_variables_initializer())
    old_opt_vars = sess.run(opt_vars)

    # Optimizer vars start out at 0.
    for opt_var in old_opt_vars:
      self.assertAllEqual(sess.run(tf.zeros_like(opt_var)), opt_var)

    sess.run(update_op)

    new_opt_vars = sess.run(opt_vars)
    # After one update, the velocities are equal to the vectors.
    for vec, opt_var in zip([vec1, vec2], new_opt_vars):
      self.assertAllEqual(sess.run(vec), opt_var)

    sess.run(update_op)
    final_opt_vars = sess.run(opt_vars)
    for first, second in zip(new_opt_vars, final_opt_vars):
      self.assertFalse(np.equal(first, second).all())
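# The behavior checked above is consistent with the standard momentum
# recurrence, with the decay passed as the second argument to
# _update_velocities. A sketch of the rule (ours, for illustration):
def reference_velocity_update(velocity, vec, decay):
  """velocity <- decay * velocity + vec."""
  return decay * velocity + vec

# Starting from velocity = 0, the first update gives velocity = vec; the
# second gives velocity = decay * vec + vec = 1.5 * vec for decay = 0.5,
# which is why final_opt_vars must differ from new_opt_vars above.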
def DISABLED_test_rnn_multi(self):
  """Test automatic registration on a static RNN.

  The model tested here is designed for MNIST classification. To classify
  images using a recurrent neural network, we treat every image row as a
  sequence of pixels. Because the MNIST image shape is 28*28px, we handle
  28 sequences of 28 timesteps for every sample.
  """
  with tf.Graph().as_default():
    dtype = tf.float32
    n_input = 28  # MNIST data input (img shape: 28*28)
    n_timesteps = 28  # timesteps
    n_hidden = 128  # hidden layer num of features
    n_classes = 10  # MNIST total classes (0-9 digits)

    x = tf.placeholder(dtype, [None, n_timesteps, n_input])
    y = tf.placeholder(tf.int32, [None])

    x_unstack = tf.unstack(x, n_timesteps, 1)

    w_input = tf.get_variable(
        'w_input', shape=[n_input, n_hidden], dtype=dtype)
    b_input = tf.get_variable('b_input', shape=[n_hidden], dtype=dtype)

    w_recurrent = tf.get_variable(
        'w_recurrent', shape=[n_hidden, n_hidden], dtype=dtype)
    b_recurrent = tf.get_variable(
        'b_recurrent', shape=[n_hidden], dtype=dtype)

    w_output = tf.get_variable(
        'w_output', shape=[n_hidden, n_classes], dtype=dtype)
    b_output = tf.get_variable('b_output', shape=[n_classes], dtype=dtype)

    layer_collection_manual = lc.LayerCollection()
    layer_collection_auto = lc.LayerCollection()

    a = tf.zeros([tf.shape(x_unstack[0])[0], n_hidden], dtype=dtype)

    # Here 'a' are the activations, 's' the pre-activations.
    a_list = [a]
    s_input_list = []
    s_recurrent_list = []
    s_list = []
    s_out_list = []

    cost = 0.0
    for i in range(len(x_unstack)):
      input_ = x_unstack[i]

      s_in = tf.matmul(input_, w_input) + b_input
      s_rec = tf.matmul(a, w_recurrent) + b_recurrent
      s = s_in + s_rec

      s_input_list.append(s_in)
      s_recurrent_list.append(s_rec)
      s_list.append(s)

      a = tf.tanh(s)
      a_list.append(a)

      s_out = tf.matmul(a, w_output) + b_output
      s_out_list.append(s_out)

      # Only the final timestep carries the real labels; earlier timesteps
      # get dummy zero labels so every output contributes to the cost.
      if i == len(x_unstack) - 1:
        labels = y
      else:
        labels = tf.zeros([tf.shape(y)[0]], dtype=tf.int32)

      cost += tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              logits=s_out, labels=labels))

      layer_collection_manual.register_categorical_predictive_distribution(
          s_out)
      layer_collection_auto.register_categorical_predictive_distribution(
          s_out)

    layer_collection_manual.register_fully_connected_multi(
        (w_input, b_input), x_unstack, s_input_list)
    layer_collection_manual.register_fully_connected_multi(
        (w_recurrent, b_recurrent), a_list[:-1], s_recurrent_list)
    layer_collection_manual.register_fully_connected_multi(
        (w_output, b_output), a_list[1:], s_out_list)

    # Constructing the optimizer performs automatic layer registration.
    auto_optimizer = optimizer.KfacOptimizer(  # pylint: disable=unused-variable
        learning_rate=1,
        cov_ema_decay=1,
        damping=1,
        layer_collection=layer_collection_auto,
        momentum=1)

    assert_fisher_blocks_match(self, layer_collection_manual,
                               layer_collection_auto)
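# For reference, the manual registration above hands each shared parameter
# block the full list of its per-timestep uses, which is what
# register_fully_connected_multi expects for weights reused across time:
#
#   (w_input, b_input):         x_unstack[t] -> s_input_list[t]
#   (w_recurrent, b_recurrent): a_list[t]    -> s_recurrent_list[t]
#   (w_output, b_output):       a_list[t+1]  -> s_out_list[t]
#
# The offset slices a_list[:-1] and a_list[1:] encode the difference between
# the *previous* activations (fed to the recurrent matmul) and the *current*
# activations (fed to the output layer).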
def test_multitower_examples_model(self):
  """Ensure graph search runs properly on a multitower setup.

  This test uses linear_model from examples/convnets.
  """
  with tf.Graph().as_default():

    def linear_model(images, labels, num_classes):
      """Creates a linear model.

      Args:
        images: The input image tensors, a tensor of size
          (batch_size x height_in x width_in x channels).
        labels: The sparse target labels, a tensor of size (batch_size x 1).
        num_classes: The number of classes, needed for one-hot encoding
          (int).

      Returns:
        loss: The total loss for this model (0-D tensor).
        logits: Predictions for this model (batch_size x num_classes).
      """
      images = tf.reshape(images, [images.shape[0], -1])
      logits = tf.layers.dense(images, num_classes, name='logits')
      loss = sparse_softmax_cross_entropy(labels, logits, num_classes)
      return loss, logits

    model = linear_model
    layer_collection = lc.LayerCollection()
    num_towers = 2
    batch_size = num_towers
    num_classes = 2

    # Set up data.
    images = tf.random_uniform(shape=[batch_size, 32, 32, 1])
    labels = tf.random_uniform(
        dtype=tf.int64, shape=[batch_size, 1], maxval=num_classes)

    tower_images = tf.split(images, num_towers)
    tower_labels = tf.split(labels, num_towers)

    # Build model.
    losses = []
    logits = []
    for tower_id in range(num_towers):
      tower_name = 'tower%d' % tower_id
      with tf.name_scope(tower_name):
        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
          current_loss, current_logits = model(
              tower_images[tower_id], tower_labels[tower_id], num_classes + 1)
          layer_collection.register_categorical_predictive_distribution(
              current_logits, name='logits')
          losses.append(current_loss)
          logits.append(current_logits)

    # Run the graph scanner.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      gs.register_layers(layer_collection, tf.trainable_variables())
    self.assertEqual(len(layer_collection.fisher_blocks), 1)
    fisher_block = list(layer_collection.fisher_blocks.values())[0]
    self.assertIsInstance(fisher_block, fb.FullyConnectedKFACBasicFB)
    self.assertEqual(fisher_block.num_registered_towers, num_towers)

    global_step = tf.train.get_or_create_global_step()
    opt = optimizer.KfacOptimizer(
        learning_rate=0.1,
        cov_ema_decay=0.1,
        damping=0.1,
        layer_collection=layer_collection,
        momentum=0.1)
    cost = tf.reduce_mean(losses)
    (cov_update_thunks,
     inv_update_thunks) = opt.make_vars_and_create_op_thunks()
    cov_update_op = tf.group(*(thunk() for thunk in cov_update_thunks))
    inv_update_op = tf.group(*(thunk() for thunk in inv_update_thunks))
    train_op = opt.minimize(cost, global_step=global_step)
    init = tf.global_variables_initializer()

    # Run a single training step.
    with self.test_session() as sess:
      sess.run(init)
      sess.run([cov_update_op])
      sess.run([inv_update_op])
      sess.run([train_op])