def testTrainingMomentum(self):
  """Run one momentum training step under the IPU scope and check memory.

  Builds `inference(x)` with a softmax cross-entropy loss, runs a single
  training step on all-zero inputs, and asserts the total tile memory
  reported by `tu.ReportJSON`.
  """
  image_shape = [1, 224, 224, 4]
  label_shape = [1, 1000]

  with self.session() as sess:
    x = array_ops.placeholder(datatype, shape=image_shape)
    y_ = array_ops.placeholder(datatype, shape=label_shape)

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      logits = inference(x)
      cross_entropy = nn_ops.softmax_cross_entropy_with_logits_v2(
          logits=logits, labels=array_ops.stop_gradient(y_))
      loss = math_ops.reduce_mean(cross_entropy)
      optimizer = momentum.MomentumOptimizer(0.01, 0.9)
      train = optimizer.minimize(loss)

    report = tu.ReportJSON(self, sess)

    sess.run(variables.global_variables_initializer())
    # Reset the report after initialisation and before the training step.
    report.reset()

    feed = {x: np.zeros(image_shape), y_: np.zeros(label_shape)}
    sess.run(train, feed_dict=feed)

    report.parse_log()
    report.assert_total_tile_memory(38642237)
def testSharing(self):
  """Check that two update ops from one optimizer share momentum slots.

  Both `apply_gradients` calls are made on the same optimizer and the same
  variables; running the first op and then the second must behave like two
  consecutive momentum steps.
  """

  def expect_close(expected, tensor):
    # Type-aware closeness check of a tensor's current value.
    self.assertAllCloseAccordingToType(
        np.array(expected), self.evaluate(tensor))

  for dtype in (dtypes.half, dtypes.float32, dtypes.float64):
    # train.MomentumOptimizer is V1 only API.
    with ops.Graph().as_default(), self.cached_session():
      param0 = variables.Variable([1.0, 2.0], dtype=dtype)
      param1 = variables.Variable([3.0, 4.0], dtype=dtype)
      grad0 = constant_op.constant([0.1, 0.1], dtype=dtype)
      grad1 = constant_op.constant([0.01, 0.01], dtype=dtype)
      optimizer = momentum_lib.MomentumOptimizer(
          learning_rate=2.0, momentum=0.9)
      first_update = optimizer.apply_gradients(
          zip([grad0, grad1], [param0, param1]))
      second_update = optimizer.apply_gradients(
          zip([grad0, grad1], [param0, param1]))
      self.evaluate(variables.global_variables_initializer())

      # Exactly one slot kind, shaped like the variable it belongs to.
      self.assertEqual(["momentum"], optimizer.get_slot_names())
      accum0 = optimizer.get_slot(param0, "momentum")
      self.assertEqual(accum0.get_shape(), param0.get_shape())
      accum1 = optimizer.get_slot(param1, "momentum")
      self.assertEqual(accum1.get_shape(), param1.get_shape())

      # Fetch params to validate initial values.
      self.assertAllClose([1.0, 2.0], self.evaluate(param0))
      self.assertAllClose([3.0, 4.0], self.evaluate(param1))

      # Step 1: the momentum accumulators were 0, so we should see a normal
      # update: v -= grad * learning_rate.
      first_update.run()
      # Accumulators now hold the gradients.
      expect_close([0.1, 0.1], accum0)
      expect_close([0.01, 0.01], accum1)
      # Parameters moved by grad * learning_rate.
      expect_close([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)], param0)
      expect_close([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)], param1)

      # Step 2: the accumulators contain the previous update, even though a
      # different update op is being run.
      second_update.run()
      expect_close([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)], accum0)
      expect_close([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)], accum1)
      expect_close([
          1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
          2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
      ], param0)
      expect_close([
          2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
          3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
      ], param1)
def testIterationsNotMultiple(self):
  """Expect FailedPreconditionError from the gradient accumulation loop.

  The error message must match
  'Detected a gradient accumulation operation with 32'.
  """

  def dataset_parser(value):
    # Second element is derived from the first: (value + 10) / 2.
    return value, (value + 10.) / 2.0

  def dataset_fn():
    increasing = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    batched = increasing.batch(batch_size=2, drop_remainder=True)
    return batched.map(dataset_parser)

  def model(c, x, b):
    with variable_scope.variable_scope("vs", use_resource=True):
      conv = layers.Conv2D(2,
                           1,
                           use_bias=True,
                           kernel_initializer=init_ops.ones_initializer(),
                           name='conv1')
      y = conv(x)
    y = y + b
    return math_ops.reduce_sum(y) + c

  def inputs_fn():
    with ops.device('cpu'):
      return [array_ops.placeholder(np.float32, shape=[])]

  expected_message = 'Detected a gradient accumulation operation with 32'
  with self.assertRaisesRegex(errors.FailedPreconditionError,
                              expected_message):
    optimizer = momentum.MomentumOptimizer(0.01, 0.9)
    _gradient_accumulation_loop(self, model, inputs_fn, [10.01], 3, 32,
                                dataset_fn, optimizer, 10)
def testTensorLearningRateAndMomentum(self):
  """Run two momentum steps with tensor-valued learning rate and momentum."""

  def expect_close(expected, tensor):
    # Type-aware closeness check of a tensor's current value.
    self.assertAllCloseAccordingToType(
        np.array(expected), self.evaluate(tensor))

  for dtype in self.float_types:
    with self.session(), self.test_scope():
      param0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
      param1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
      grad0 = constant_op.constant([0.1, 0.1], dtype=dtype)
      grad1 = constant_op.constant([0.01, 0.01], dtype=dtype)
      # Hyper-parameters are passed as tensors rather than Python floats.
      learning_rate_t = constant_op.constant(2.0)
      momentum_t = constant_op.constant(0.9)
      optimizer = momentum_lib.MomentumOptimizer(
          learning_rate=learning_rate_t, momentum=momentum_t)
      update = optimizer.apply_gradients(
          zip([grad0, grad1], [param0, param1]))
      self.evaluate(variables.global_variables_initializer())

      # Check we have slots, and that they are not trainable variables.
      self.assertEqual(["momentum"], optimizer.get_slot_names())
      accum0 = optimizer.get_slot(param0, "momentum")
      self.assertEqual(accum0.get_shape(), param0.get_shape())
      self.assertFalse(accum0 in variables.trainable_variables())
      accum1 = optimizer.get_slot(param1, "momentum")
      self.assertEqual(accum1.get_shape(), param1.get_shape())
      self.assertFalse(accum1 in variables.trainable_variables())

      # Fetch params to validate initial values.
      self.assertAllClose([1.0, 2.0], self.evaluate(param0))
      self.assertAllClose([3.0, 4.0], self.evaluate(param1))

      # Step 1: the momentum accumulators were 0, so we should see a normal
      # update: v -= grad * learning_rate.
      update.run()
      expect_close([0.1, 0.1], accum0)
      expect_close([0.01, 0.01], accum1)
      expect_close([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)], param0)
      expect_close([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)], param1)

      # Step 2: the momentum accumulators contain the previous update.
      update.run()
      expect_close([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)], accum0)
      expect_close([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)], accum1)
      expect_close([
          1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
          2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
      ], param0)
      expect_close([
          2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
          3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
      ], param1)
def testCompare3(self):
  """Run an embedding-lookup model through `_compare_to_cpu`."""

  def dataset_fn():
    def dataset_parser(value):
      # Label is the per-row mean, scaled down by 10; both outputs are int32.
      label = math_ops.reduce_mean(value, axis=[1])
      indices = math_ops.cast(value, np.int32)
      scaled_label = math_ops.cast(label / 10, np.int32)
      return indices, scaled_label

    increasing = tu.create_single_increasing_dataset(10, shape=[4])
    batched = increasing.batch(batch_size=2, drop_remainder=True)
    return batched.map(dataset_parser)

  def fwd_fn(idx, label):
    with variable_scope.variable_scope("part1", use_resource=True):
      embedding = variable_scope.get_variable(
          "c",
          shape=[10, 1216],
          dtype=np.float32,
          initializer=init_ops.constant_initializer(10.01),
          trainable=True)
    looked_up = embedding_ops.embedding_lookup(embedding, idx)

    logits = math_ops.reduce_sum(looked_up, axis=[-1])
    per_example = nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    return math_ops.reduce_mean(per_example)

  num_batches_to_accumulate = 20
  repeat_count = 2
  optimizer = momentum.MomentumOptimizer(0.01, 0.8)

  # The model takes no extra inputs beyond what the dataset provides.
  _compare_to_cpu(self, fwd_fn, lambda: [], [], repeat_count,
                  num_batches_to_accumulate, dataset_fn, optimizer)
def testThatBackpropRuns(self):
  """Smoke-test that gradients can be taken through sparse_image_warp.

  Only checks that five optimisation steps execute; no values are asserted.
  """
  batch_size = 1
  image_height = 9
  image_width = 12
  image_shape = [batch_size, image_height, image_width, 3]
  image = variables.Variable(
      np.float32(np.random.uniform(size=image_shape)))

  # A single control point, expanded with a leading axis of size one.
  source_points = constant_op.constant(
      np.float32(np.expand_dims([[3., 3.]], 0)))
  displacements = constant_op.constant(
      np.float32(np.expand_dims([[0.25, -0.5]], 0)))

  warped_image, _ = sparse_image_warp.sparse_image_warp(
      image,
      source_points,
      source_points + displacements,
      num_boundary_points=3)

  loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
  optimizer = momentum.MomentumOptimizer(0.001, 0.9)
  raw_grads = gradients.gradients(loss, [image])
  clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, 1.0)
  opt_func = optimizer.apply_gradients(zip(clipped_grads, [image]))
  init_op = variables.global_variables_initializer()

  with self.test_session() as sess:
    sess.run(init_op)
    for _ in range(5):
      sess.run([loss, opt_func])
def test_interpolation_gradient(self):
  """Make sure that backprop can run. Correctness of gradients is assumed.

  Here, we use a small 'training' set and a more densely-sampled set of
  query points, for which we know the true value in advance. The goal is to
  choose x locations for the training data such that interpolating using
  this training data yields the best reconstruction for the function values
  at the query points. The training data locations are optimized iteratively
  using gradient descent.
  """
  problem = _QuadraticPlusSinProblemND()
  (query_points, query_values, train_points,
   train_values) = problem.get_problem(optimizable=True)

  regularization = 0.001
  for interpolation_order in (1, 2, 3, 4):
    interpolator = interpolate_spline.interpolate_spline(
        train_points, train_values, query_points, interpolation_order,
        regularization)

    squared_error = math_ops.square(query_values - interpolator)
    loss = math_ops.reduce_mean(squared_error)

    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
    raw_grads = gradients.gradients(loss, [train_points])
    clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, 1.0)
    opt_func = optimizer.apply_gradients(zip(clipped_grads, [train_points]))
    init_op = variables.global_variables_initializer()

    # Only execution is checked; no loss value is asserted.
    with self.test_session() as sess:
      sess.run(init_op)
      for _ in range(100):
        sess.run([loss, opt_func])
def testVariablesAcrossGraphs(self):
  """Reuse one optimizer in two graphs and check `variables()` each time.

  In each graph, `optimizer.variables()` must list exactly two variables
  whose names start with the names of that graph's own variables.
  """
  optimizer = momentum_lib.MomentumOptimizer(0.01, 0.5)

  for first_name, second_name in (("var0", "var1"), ("var2", "var3")):
    with ops.Graph().as_default():
      first = resource_variable_ops.ResourceVariable(
          [1.0, 2.0], dtype=dtypes.float32, name=first_name)
      second = resource_variable_ops.ResourceVariable(
          [3.0, 4.0], dtype=dtypes.float32, name=second_name)
      loss = math_ops.reduce_sum(first + second)
      optimizer.minimize(loss)

      optimizer_variables = optimizer.variables()
      self.assertStartsWith(optimizer_variables[0].name, first_name)
      self.assertStartsWith(optimizer_variables[1].name, second_name)
      self.assertEqual(2, len(optimizer_variables))
def testIndexedSlicesGradient(self):
  """Minimize a cost accumulated by embedding lookups inside a while loop.

  Only execution of ten training steps is checked; no values are asserted.
  """
  with ops.Graph().as_default():
    embedding_matrix = variable_scope.get_variable(
        "embedding_matrix", [5, 5],
        initializer=init_ops.random_normal_initializer())

    def cond(it, _):
      return it < 5

    def body(it, cost):
      # Look up row 0 of `embedding_matrix + 0.0`, not of the variable itself.
      embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
      updated_cost = cost + math_ops.reduce_sum(embedding)
      return it + 1, updated_cost

    initial_state = [constant_op.constant(0), constant_op.constant(0.0)]
    _, cost = control_flow_ops.while_loop(cond, body, initial_state)

    optimizer = momentum.MomentumOptimizer(0.1, 0.9)
    train_op = optimizer.minimize(cost)

    with self.cached_session() as sess:
      sess.run(variables.global_variables_initializer())
      for _ in range(10):
        sess.run([train_op])
def testNumericEquivalenceForNesterovMomentum(self):
  """Train three identical MLPs with Nesterov momentum and compare them.

  The models start from the same weights and use, respectively,
  `optimizers.SGD`, `gradient_descent.SGD` and `momentum.MomentumOptimizer`.
  Final weights, optimizer weights and loss histories are compared.
  """
  if testing_utils.should_run_tf_function() or context.executing_eagerly():
    self.skipTest(
        'v1 optimizer does not run in experimental_run_tf_function mode or '
        'eager mode')
  np.random.seed(1331)
  with self.cached_session():
    train_samples = 20
    input_dim = 3
    num_classes = 2
    num_hidden = 5

    (x, y), _ = testing_utils.get_test_data(
        train_samples=train_samples,
        test_samples=10,
        input_shape=(input_dim,),
        num_classes=num_classes)
    y = keras.utils.to_categorical(y)

    def build_mlp():
      # Every model shares the same architecture.
      return testing_utils.get_small_sequential_mlp(
          num_hidden=num_hidden,
          num_classes=num_classes,
          input_dim=input_dim)

    def compile_with(model, optimizer):
      model.compile(
          optimizer,
          loss='categorical_crossentropy',
          metrics=[],
          run_eagerly=testing_utils.should_run_eagerly(),
          experimental_run_tf_function=testing_utils.should_run_tf_function())

    def train(model):
      # Shuffling is disabled so all models see batches in the same order.
      return model.fit(x, y, batch_size=5, epochs=10, shuffle=False)

    # Build the models and copy the first model's weights down the chain.
    model_k_v1 = build_mlp()
    model_k_v2 = build_mlp()
    model_k_v2.set_weights(model_k_v1.get_weights())
    model_tf = build_mlp()
    model_tf.set_weights(model_k_v2.get_weights())

    opt_k_v1 = optimizers.SGD(momentum=0.9, nesterov=True)
    opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
    opt_tf = momentum.MomentumOptimizer(
        learning_rate=0.01, momentum=0.9, use_nesterov=True)

    compile_with(model_k_v1, opt_k_v1)
    compile_with(model_k_v2, opt_k_v2)
    compile_with(model_tf, opt_tf)

    hist_k_v1 = train(model_k_v1)
    hist_k_v2 = train(model_k_v2)
    hist_tf = train(model_tf)

    reference_weights = model_k_v1.get_weights()
    self.assertAllClose(reference_weights, model_tf.get_weights())
    self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
    self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
    self.assertAllClose(hist_k_v1.history['loss'], hist_tf.history['loss'])
    self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
def testNesterovMomentum(self):
  """Compare Nesterov momentum updates against the numpy reference helper."""
  for dtype in (dtypes.float32, dtypes.float64):
    # train.MomentumOptimizer is V1 only API.
    with ops.Graph().as_default(), self.cached_session():
      np_dtype = dtype.as_numpy_dtype
      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
      var0_np = np.array([1.0, 2.0], dtype=np_dtype)
      var1_np = np.array([3.0, 4.0], dtype=np_dtype)
      accum0_np = np.array([0.0, 0.0], dtype=np_dtype)
      accum1_np = np.array([0.0, 0.0], dtype=np_dtype)

      cost = 5 * var0 * var0 + 3 * var1
      global_step = variables.Variable(
          array_ops.zeros([], dtypes.int64), name="global_step")
      mom_op = momentum_lib.MomentumOptimizer(
          learning_rate=2.0, momentum=0.9, use_nesterov=True)
      opt_op = mom_op.minimize(cost, global_step, [var0, var1])
      self.evaluate(variables.global_variables_initializer())

      for _ in range(1, 5):
        opt_op.run()
        # Gradients of `cost`: 10 * var0 for var0, the constant 3 for var1.
        var0_np, accum0_np = self._update_nesterov_momentum_numpy(
            var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
        var1_np, accum1_np = self._update_nesterov_momentum_numpy(
            var1_np, accum1_np, 3, 2.0, 0.9)
        self.assertAllClose(var0_np, self.evaluate(var0))
        self.assertAllClose(var1_np, self.evaluate(var1))
def testMinimizeSparseResourceVariable(self):
  """Run one momentum step through an embedding lookup, variable on CPU."""
  expected_after_step = [[-111, -138]]

  for dtype in (dtypes.half, dtypes.float32, dtypes.float64):
    # This test invokes the ResourceSparseApplyMomentum operation, which
    # did not have a registered GPU kernel as of April 2018. With graph
    # execution, the placement algorithm notices this and automatically
    # places the variable in CPU (host) memory. With eager execution,
    # the variable would be placed in GPU memory if available, which
    # would then conflict with the future invocation of the
    # ResourceSparseApplyMomentum operation.
    # To work around this discrepancy, for now we force the variable
    # to be placed on CPU.
    with ops.device("/cpu:0"):
      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)

    # pylint: disable=cell-var-from-loop
    def loss():
      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
      row = embedding_ops.embedding_lookup([var0], [0])
      pred = math_ops.matmul(row, x)
      return pred * pred

    # pylint: enable=cell-var-from-loop

    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
    sgd_op = opt.minimize(loss)
    self.evaluate(variables.global_variables_initializer())

    # Run 1 step of sgd.
    self.evaluate(sgd_op)

    # Validate updated params.
    self.assertAllCloseAccordingToType(expected_after_step,
                                       self.evaluate(var0))
def testNesterovMomentum(self):
  """Compare Nesterov momentum on resource variables with a numpy reference."""
  for dtype in self.float_types:
    with self.cached_session(), self.test_scope():
      var0 = resource_variable_ops.ResourceVariable([0.1, 0.2], dtype=dtype)
      var1 = resource_variable_ops.ResourceVariable([0.3, 0.4], dtype=dtype)
      var0_np = np.array([0.1, 0.2], dtype=dtype)
      var1_np = np.array([0.3, 0.4], dtype=dtype)
      accum0_np = np.array([0.0, 0.0], dtype=dtype)
      accum1_np = np.array([0.0, 0.0], dtype=dtype)

      cost = 0.4 * var0 * var0 + 0.9 * var1
      global_step = resource_variable_ops.ResourceVariable(
          array_ops.zeros([], dtypes.int32), name="global_step")
      mom_op = momentum_lib.MomentumOptimizer(
          learning_rate=0.1, momentum=0.9, use_nesterov=True)
      opt_op = mom_op.minimize(cost, global_step, [var0, var1])
      variables.global_variables_initializer().run()

      steps = range(1, 5)
      for _ in steps:
        opt_op.run()
        # Gradients of `cost`: 0.8 * var0 for var0, the constant 0.9 for var1.
        var0_np, accum0_np = self._update_nesterov_momentum_numpy(
            var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
        var1_np, accum1_np = self._update_nesterov_momentum_numpy(
            var1_np, accum1_np, 0.9, 0.1, 0.9)
        self.assertAllCloseAccordingToType(var0_np, var0.eval())
        self.assertAllCloseAccordingToType(var1_np, var1.eval())
def testDynamicLossScaleWithSlots(self, strategy_fn):
  """Check dynamic loss scaling around an optimizer that has slot variables.

  Args:
    strategy_fn: Callable returning an object whose `scope()` context yields
      the strategy used to run the training step.
  """
  with strategy_fn().scope() as strategy:
    var = variables.Variable([1.0, 2.0])
    # An SGD optimizer with momentum has slot variables.
    inner_opt = momentum.MomentumOptimizer(1.0, momentum=1.)
    initial_loss_scale = 2.
    loss_scale = loss_scale_module.DynamicLossScale(
        initial_loss_scale=initial_loss_scale,
        increment_period=1,
        multiplier=4)
    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
        inner_opt, loss_scale)
    loss = lambda: var / strategy.num_replicas_in_sync
    run_fn = lambda: opt.minimize(loss, var_list=[var])

    first_run = strategy.experimental_run(run_fn)
    self.evaluate(variables.global_variables_initializer())
    self._run_if_in_graph_mode(first_run)
    # The momentum accumulator starts at 0 and the gradient is 1. The
    # accumulator is incremented by the gradient, so it is now 1. Then the
    # variable is subtracted by the accumulator, so the variable is subtracted
    # by 1.
    self.assertAllClose([0.0, 1.0], self.evaluate(var))
    self.assertEqual(self.evaluate(opt._loss_scale()),
                     initial_loss_scale * 4)

    second_run = strategy.experimental_run(run_fn)
    self._run_if_in_graph_mode(second_run)
    # The momentum accumulator was 1 before this step and the gradient is 1.
    # The accumulator is incremented by the gradient, so it is now 2. Then the
    # variable is subtracted by the accumulator, so the variable is subtracted
    # by 2.
    self.assertAllClose([-2., -1.], self.evaluate(var))
    self.assertEqual(self.evaluate(opt._loss_scale()),
                     initial_loss_scale * 16)
def test_ops_with_var_and_momentum(self):
  """Run the shared test context with one variable and a momentum optimizer."""
  sparse_var = deo.get_variable('sp_var', initializer=0.0, dim=2)
  momentum_opt = momentum.MomentumOptimizer(0.1, 0.1)
  self.common_run_context([sparse_var], [momentum_opt], name='momentum_test')
def get_multiple_optimizers():
  """Return one instance each of five optimizers, all with learning rate 0.1.

  Returns:
    A list, in order: Adagrad, Adam, Ftrl, Momentum (momentum 0.1), RMSProp.
  """
  adagrad_opt = adagrad.AdagradOptimizer(0.1)
  adam_opt = adam.AdamOptimizer(0.1)
  ftrl_opt = ftrl.FtrlOptimizer(0.1)
  momentum_opt = momentum.MomentumOptimizer(0.1, 0.1)
  rmsprop_opt = rmsprop.RMSPropOptimizer(0.1)
  return [adagrad_opt, adam_opt, ftrl_opt, momentum_opt, rmsprop_opt]
def body(x, label):
  """Build one momentum training step for `inference(x)`.

  Args:
    x: Input passed to `inference`.
    label: Labels for the softmax cross-entropy; gradients are stopped on it.

  Returns:
    A tuple `(x, label, train_op)` where `train_op` minimizes the mean
    cross-entropy with a MomentumOptimizer(0.01, 0.9).
  """
  logits = inference(x)
  cross_entropy = nn_ops.softmax_cross_entropy_with_logits_v2(
      logits=logits, labels=array_ops.stop_gradient(label))
  loss = math_ops.reduce_mean(cross_entropy)
  optimizer = momentum.MomentumOptimizer(0.01, 0.9)
  train_op = optimizer.minimize(loss)
  return x, label, train_op
def testPipelineCompareRecomputeDropout(self):
  """Compare a three-stage dropout pipeline against sharding.

  Uses recomputation (`recomp=True`) and the Grouped pipeline schedule.
  """

  def dataset_fn():
    def dataset_parser(value):
      # Label is the first element of the image, modulo 4, as int32.
      label = value[0][0] % 4
      return value, math_ops.cast(label, np.int32)

    increasing = tu.create_single_increasing_dataset(7, shape=[4, 4])
    parsed = increasing.map(dataset_parser)
    return parsed.batch(batch_size=2, drop_remainder=True)

  def stage(x, name):
    with variable_scope.variable_scope(name, use_resource=True):
      weight = variable_scope.get_variable(
          "w",
          shape=[4, 4],
          dtype=np.float32,
          initializer=init_ops.ones_initializer())
      projected = math_ops.matmul(x, weight)
      return rand_ops.dropout(projected, seed=[10, 10])

  def stage1(x, label):
    return stage(x, "s1"), label

  def stage2(x, label):
    return stage(x, "s2"), label

  def stage3(x, label):
    activations = stage(x, "s3")
    logits = math_ops.reduce_sum(activations, axis=[1])
    per_example = nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label)
    return math_ops.reduce_mean(per_example)

  def inputs_fn():
    with ops.device('cpu'):
      return []

  gradient_accumulation_count = 24
  repeat_count = 2
  optimizer = momentum.MomentumOptimizer(0.01, 0.98)
  stages = [stage1, stage2, stage3]

  pipelining_test_util.PipelineTester.compare_pipeline_to_sharding(
      stages,
      inputs_fn, [10.01],
      repeat_count,
      gradient_accumulation_count,
      dataset_fn,
      optimizer,
      self,
      21458,
      recomp=True,
      schedule=pipelining_ops.PipelineSchedule.Grouped)
def body(x, label):
  """Build one gradient-accumulating momentum training step.

  Args:
    x: Input passed to `inference`.
    label: Labels for the softmax cross-entropy; gradients are stopped on it.

  Returns:
    A tuple `(x, label, train_op)` where `train_op` minimizes the mean
    cross-entropy using a GradientAccumulationOptimizer that wraps
    MomentumOptimizer(0.01, 0.9) and is constructed with the value 10.
  """
  logits = inference(x)
  cross_entropy = nn_ops.softmax_cross_entropy_with_logits_v2(
      logits=logits, labels=array_ops.stop_gradient(label))
  loss = math_ops.reduce_mean(cross_entropy)

  base_optimizer = momentum.MomentumOptimizer(0.01, 0.9)
  opt = gradient_accumulation_optimizer.GradientAccumulationOptimizer(
      base_optimizer, 10)
  train_op = opt.minimize(loss)
  return x, label, train_op
def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
  """Run one momentum step when the embedding lookup indices are 2-D."""
  var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))

  def loss():
    # Indices [[1]] are rank 2 and select row 1 of var0.
    looked_up = embedding_ops.embedding_lookup(var0, [[1]])
    return math_ops.reduce_sum(looked_up)

  opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
  sgd_op = opt.minimize(loss)
  self.evaluate(variables.global_variables_initializer())
  self.evaluate(sgd_op)

  # Only the looked-up row is expected to change.
  expected = [[1, 1], [0, 0]]
  self.assertAllCloseAccordingToType(expected, self.evaluate(var0))
def testReturningNonTensorRaisesError(self):
  """Check that a defun-wrapped `apply_gradients` raises TypeError.

  `optimizer.apply_gradients` is wrapped with `function.defun`; calling it is
  expected to raise a TypeError whose message matches
  '.*must return zero or more Tensors.*'.
  """
  optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
  optimizer.apply_gradients = function.defun(optimizer.apply_gradients)
  v = resource_variable_ops.ResourceVariable(1.0)
  grad = backprop.implicit_grad(lambda v: v**2)(v)

  # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp, which
  # was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(TypeError,
                              '.*must return zero or more Tensors.*'):
    # TODO(akshayka): We might want to allow defun-ing Python functions
    # that return operations (and just execute the op instead of running it).
    optimizer.apply_gradients(grad)
def testLikeDistBeliefMom01(self):
  """Replay the gradients from `_dbParamsMom01` and check each result.

  After feeding the i-th gradient, the variable must be close to the i-th
  expected output.
  """
  with self.cached_session():
    db_grad, db_out = self._dbParamsMom01()
    num_samples = len(db_grad)
    zeros = [0.0] * num_samples
    var0 = variables.Variable(zeros)
    # The constant's value is overridden via feed_dict on every step.
    grads0 = constant_op.constant([0.0] * num_samples)
    optimizer = momentum_lib.MomentumOptimizer(
        learning_rate=0.1, momentum=0.1)
    update = optimizer.apply_gradients(zip([grads0], [var0]))
    variables.global_variables_initializer().run()

    for step in xrange(num_samples):
      update.run(feed_dict={grads0: db_grad[step]})
      expected = np.array(db_out[step])
      self.assertAllClose(expected, self.evaluate(var0))
def model(features):
  """Build a repeated-multiply model and its momentum training op.

  Args:
    features: Initial input to the repeated loop body.

  Returns:
    A tuple `(a, loss, train_op)`: the variable "a", the summed loop output,
    and the op that applies the momentum gradients.
  """
  a = variable_scope.get_variable("a", initializer=1.0)

  def body(x):
    return a * x

  # Run `body` through ipu.loops.repeat with a count of 5.
  logits = ipu.loops.repeat(5, body, [features])
  loss = math_ops.reduce_sum(logits)

  optimizer = momentum.MomentumOptimizer(learning_rate=.001, momentum=0.9)
  train_op = optimizer.apply_gradients(optimizer.compute_gradients(loss))
  return a, loss, train_op
def _GetOptimizer(self, opt): if opt == "adagrad": return adagrad.AdagradOptimizer(learning_rate=1e-2) elif opt == "adam": return adam.AdamOptimizer(learning_rate=1e-2) elif opt == "rmsprop": return rmsprop.RMSPropOptimizer(learning_rate=1e-2) elif opt == "momentum": return momentum.MomentumOptimizer(learning_rate=1e-2, momentum=0.9) elif opt == "sgd": return gradient_descent.GradientDescentOptimizer(learning_rate=1e-2) else: raise ValueError("Unsupported optimizer: %s" % opt)
def testOptimizerInDefunWithCapturedVariable(self):
  """Apply gradients inside a defun to a variable created outside it."""
  v = resource_variable_ops.ResourceVariable(1.0)
  optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)

  def loss():
    return v**2

  @function.defun
  def train():
    grads_and_vars = backprop.implicit_grad(loss)()
    optimizer.apply_gradients(grads_and_vars)

  train()

  # One step is expected to move v from 1.0 to -1.0.
  self.assertEqual(v.numpy(), -1.0)
def testOptimizerInDefun(self):
  """Create a variable and apply gradients to it, all inside a defun."""
  optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)

  def loss(v):
    return v**2

  @function.defun
  def train():
    # The variable is created inside the defun and stored on the test case.
    self.v = resource_variable_ops.ResourceVariable(1.0)
    grads_and_vars = backprop.implicit_grad(loss)(self.v)
    optimizer.apply_gradients(grads_and_vars)
    return self.v.read_value()

  result = train()

  # One step is expected to move the variable from 1.0 to -1.0.
  self.assertEqual(result.numpy(), -1.0)
def testMinimizeSparseResourceVariable(self):
  """Run one momentum step through an embedding lookup in a test session."""
  initial_value = [[1.0, 2.0]]
  expected_after_step = [[-111, -138]]

  for dtype in (dtypes.half, dtypes.float32, dtypes.float64):
    with self.test_session():
      var0 = resource_variable_ops.ResourceVariable(
          [[1.0, 2.0]], dtype=dtype)
      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
      row = embedding_ops.embedding_lookup([var0], [0])
      pred = math_ops.matmul(row, x)
      loss = pred * pred
      optimizer = momentum_lib.MomentumOptimizer(
          learning_rate=1.0, momentum=0.0)
      sgd_op = optimizer.minimize(loss)
      variables.global_variables_initializer().run()

      # Fetch params to validate initial values.
      self.assertAllCloseAccordingToType(initial_value, var0.eval())

      # Run 1 step of sgd.
      sgd_op.run()

      # Validate updated params.
      self.assertAllCloseAccordingToType(expected_after_step, var0.eval())
def testMinimizeSparseResourceVariable(self):
  """Run one momentum step on a loss built from an embedding lookup."""
  expected_after_step = [[-111, -138]]

  for dtype in (dtypes.half, dtypes.float32, dtypes.float64):
    var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)

    # pylint: disable=cell-var-from-loop
    def loss():
      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
      row = embedding_ops.embedding_lookup([var0], [0])
      pred = math_ops.matmul(row, x)
      return pred * pred

    # pylint: enable=cell-var-from-loop

    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
    sgd_op = opt.minimize(loss)
    self.evaluate(variables.global_variables_initializer())

    # Run 1 step of sgd.
    self.evaluate(sgd_op)

    # Validate updated params.
    self.assertAllCloseAccordingToType(expected_after_step,
                                       self.evaluate(var0))
def testMetagraph(self):
  """Export a meta graph, import it into a new graph, and export again."""
  with ops.Graph().as_default():
    with variable_scope.variable_scope("foo", use_resource=True):
      a = variable_scope.get_variable("a", initializer=10.0)

    optimizer = momentum.MomentumOptimizer(learning_rate=0.001, momentum=0.1)
    optimizer.minimize(
        a,
        colocate_gradients_with_ops=True,
        global_step=training_util.get_or_create_global_step())

    graph = ops.get_default_graph()
    meta_graph_def = saver.export_meta_graph(graph=graph)

  with ops.Graph().as_default():
    saver.import_meta_graph(meta_graph_def, import_scope="")
    # NOTE(review): this exports `graph` (the first graph) again, not the
    # graph that was just imported into -- confirm whether that is intended.
    meta_graph_two = saver.export_meta_graph(graph=graph)
  self.assertEqual(meta_graph_def, meta_graph_two)
def testIndexedSlicesGradient(self):
  """Minimize a cost accumulated by embedding lookups inside a While loop.

  Only execution of ten training steps is checked; no values are asserted.
  """
  with ops.Graph().as_default():
    embedding_matrix = tf.get_variable(
        "embedding_matrix", [5, 5],
        initializer=tf.random_normal_initializer())

    def Cond(it, _):
      return it < 5

    def Body(it, cost):
      # Look up row 0 of `embedding_matrix + 0.0`, not of the variable itself.
      embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
      updated_cost = cost + tf.reduce_sum(embedding)
      return it + 1, updated_cost

    initial_state = [tf.constant(0), tf.constant(0.0)]
    _, cost = control_flow_ops.While(Cond, Body, initial_state)

    optimizer = momentum.MomentumOptimizer(0.1, 0.9)
    train_op = optimizer.minimize(cost)

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())
      for _ in range(10):
        sess.run([train_op])