def _Apply2(proj_layer, opt):
  """Runs two accumulation steps of `opt` against `proj_layer`.

  Uses the enclosing test's `np_input1`/`np_input2` batches and `lr`.

  Returns:
    Tuple of (vars snapshot after step 1, vars after step 2,
    grads of step 1, grads of step 2), where grads are NestedMaps of
    (var, grad) tuples.
  """

  def _GradsForBatch(batch):
    # Fprop one batch and differentiate the summed output w.r.t. all vars.
    out = proj_layer.FPropDefaultTheta(batch, in_padding1)
    return py_utils.ComputeGradients(tf.reduce_sum(out), proj_layer.vars)

  var_grads_step1 = _GradsForBatch(np_input1)
  grads_step1 = var_grads_step1.Transform(tuple)
  var_grads_step2 = _GradsForBatch(np_input2)
  grads_step2 = var_grads_step2.Transform(tuple)

  with cluster_factory.ForTestingWorker(add_summary=True):
    _ = opt.Apply(lr, var_grads_step1)
  # Snapshot variable values between the two Apply calls.
  vars_mid = [v.read_value() for v in proj_layer.vars.Flatten()]
  tf.assign_add(py_utils.GetOrCreateGlobalStepVar(), 1)
  with cluster_factory.ForTestingWorker(add_summary=True):
    _ = opt.Apply(lr, var_grads_step2)
  vars_final = proj_layer.vars.Flatten()
  return vars_mid, vars_final, grads_step1, grads_step2
def testMaskGradient(self):
  """Tests that MaskGradients zeroes out gradients selected by the mask."""
  with self.session(use_gpu=False) as sess:
    a = tf.get_variable('a', [])
    b = tf.get_variable('b', [])
    c = tf.get_variable('c', [])
    d = tf.get_variable('d', [])
    e = tf.get_variable('e', [])
    # 'e' is intentionally excluded from the loss; it only appears in vmap.
    l = a + b + c + d
    zeros = tf.zeros(3, dtype=tf.float32)
    select = tf.one_hot(1, 3, dtype=tf.float32)
    vmap = py_utils.NestedMap(
        a=a, b=b, c=c, d=d, n=py_utils.NestedMap(aa=a, e=e))
    grad_mask = py_utils.NestedMap()
    # Mask entries are keyed by the variable's tensor name ('<name>:0').
    grad_mask['a:0'] = zeros
    grad_mask['b:0'] = zeros
    grad_mask['c:0'] = select
    grad_mask['d:0'] = select
    grad_onehot = tf.one_hot(1, 3, dtype=tf.float32)
    var_grads = py_utils.ComputeGradients(l, vmap)
    var_grads_mask = py_utils.MaskGradients(var_grads, grad_mask, grad_onehot)
    sess.run(tf.global_variables_initializer())
    _, var_grads_mask_vals = sess.run([var_grads, var_grads_mask])
    # 'a' and 'b' are masked, while 'c' and 'd' are not.
    self.assertEqual(var_grads_mask_vals['a'][1], 0)
    self.assertEqual(var_grads_mask_vals['b'][1], 0)
    self.assertEqual(var_grads_mask_vals['c'][1], 1)
    self.assertEqual(var_grads_mask_vals['d'][1], 1)
def testGradientMult(self):
  """Smoke-tests ApplyGradMultiplier on a full model's gradients."""
  with self.session(use_gpu=False, graph=tf.Graph()):
    model = self._testParams().Instantiate()
    model.FPropDefaultTheta()
    grads = py_utils.ComputeGradients(model.loss, model.vars)
    # Graph construction only; a negative multiplier must be accepted.
    py_utils.ApplyGradMultiplier(grads, -1.1)
def testCollectVarHistogram(self):
  """Smoke-tests histogram-summary creation for (var, grad) pairs."""
  with self.session(use_gpu=False, graph=tf.Graph()):
    model = self._testParams().Instantiate()
    model.FPropDefaultTheta()
    grads = py_utils.ComputeGradients(model.loss, model.vars)
    # Only verifies the summary ops can be built without raising.
    summary_utils.CollectVarHistogram(grads)
def testSkipL1Regularization(self):
  """Tests that vars in SKIP_LP_REGULARIZATION skip the L1 penalty."""
  with self.session(use_gpu=False) as sess:
    beta = tf.get_variable(
        'beta',
        initializer=tf.constant(np.arange(10).reshape([1, 10]), tf.float32))
    # Registering 'beta' here should exclude it from the Lp loss below.
    tf.add_to_collection(py_utils.SKIP_LP_REGULARIZATION, beta)
    gamma = tf.get_variable(
        'gamma',
        initializer=tf.constant(np.arange(10).reshape([1, 10]), tf.float32))
    act = tf.constant(np.arange(10).reshape([1, 10]), tf.float32)
    pred = act * gamma + beta
    loss = tf.reduce_sum(pred)
    vmap = py_utils.NestedMap(beta=beta, gamma=gamma)
    var_grads = py_utils.ComputeGradients(loss, vmap)
    self.assertEqual(sorted(var_grads.keys()), ['beta', 'gamma'])
    l1_loss, var_grads_with_l1 = py_utils.AdjustGradientsWithLpLoss(
        var_grads, 0.1, p=1.0)
    sess.run(tf.global_variables_initializer())
    var_grads_vals, l1_loss_val, var_grads_with_l1_vals = sess.run(
        [var_grads, l1_loss, var_grads_with_l1])
    print('var_grads_vals = ', var_grads_vals)
    print('var_grads_with_l1_vals = ', var_grads_with_l1_vals)
    # Variable values themselves are untouched by the adjustment.
    self.assertAllEqual(var_grads_vals.beta[0],
                        var_grads_with_l1_vals.beta[0])
    self.assertAllEqual(var_grads_vals.gamma[0],
                        var_grads_with_l1_vals.gamma[0])
    # Only 'gamma' contributes to the L1 loss; 'beta' is skipped.
    self.assertAllEqual(l1_loss_val,
                        0.1 * np.sum(np.abs(var_grads_vals.gamma[0])))
def _Apply1(proj_layer, opt):
  """Applies two half-weighted updates of `opt` to `proj_layer`.

  Uses the enclosing test's `inputs1`/`inputs2`, paddings, and `lr`.

  Returns:
    Tuple of (final var values, grads of batch 1, grads of batch 2),
    where grads are NestedMaps of (var, grad) tuples.
  """
  out_a = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
  out_b = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
  var_grads_a = py_utils.ComputeGradients(tf.reduce_sum(out_a),
                                          proj_layer.vars)
  var_grads_b = py_utils.ComputeGradients(tf.reduce_sum(out_b),
                                          proj_layer.vars)
  # Scaling each step's gradient by 1/2 makes the two updates equivalent to
  # a single update with the averaged gradient.
  _ = opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads_a, 1. / 2.))
  _ = opt.Apply(lr, py_utils.ApplyGradMultiplier(var_grads_b, 1. / 2.))
  final_vars = proj_layer.vars.Flatten()
  grads_a = var_grads_a.Transform(tuple)
  grads_b = var_grads_b.Transform(tuple)
  return final_vars, grads_a, grads_b
def _FpropBprop(self, fc_layer, opt):
  """Runs one fprop/bprop/apply cycle and checks the update op's name."""
  batch = tf.zeros(shape=[2, 4, 3], dtype=tf.float64)
  scalar_loss = tf.reduce_sum(fc_layer.FPropDefaultTheta(batch))
  grads = py_utils.ComputeGradients(scalar_loss, fc_layer.vars)
  # Name becomes meaningless in Eager mode. Here we just check whether
  # errors get raised.
  update_op = opt.Apply(1e-1, grads)
  self.assertIn('composite_optimizer_train_op', update_op.name)
def testComputeGradient(self):
  """Tests which vmap entries actually receive gradients."""
  with self.session(use_gpu=False):
    a = tf.get_variable('a', [])
    # 'b' is deliberately non-trainable.
    b = tf.get_variable('b', [], trainable=False)
    c = tf.get_variable('c', [])
    e = tf.get_variable('e', [])
    # 'c' is cut out of the gradient path via stop_gradient.
    l = a + b + tf.stop_gradient(c)
    vmap = py_utils.NestedMap(
        a=a, b=b, c=c, d=None, n=py_utils.NestedMap(aa=a, e=e))
    var_grads = py_utils.ComputeGradients(l, vmap)
    print('var_grads = ', var_grads.DebugString())
    # Only 'a' matters. b is not trainable; c has stop_gradient; d
    # is None; e is not computed by l and aa is a duplicated.
    self.assertEqual([_[0] for _ in var_grads.FlattenItems()], ['a'])
    self.assertEqual(var_grads.a[0].name, 'a:0')
def testCompositeOptimizer(self):
  """Tests CompositeOptimizer regex->optimizer routing and overlap errors."""
  adam_op = optimizer.Adam.Params()
  rmsprop_op = optimizer.RMSProp.Params()
  # Each optimizer_map entry maps a variable-name regex to a tuple of
  # (optimizer params, learning-rate multiplier).
  adam_rmsprop_opt = optimizer.CompositeOptimizer.Params().Set(
      optimizer_map={
          'fc/w': (adam_op, 1.),
          'fc/b': (rmsprop_op, 1.),
          'default_optimizer': (adam_op, 1.)
      }).Instantiate()
  adam_op_2 = optimizer.Adam.Params().Set(name='adam_2')
  # 'fc/b' is not listed here; it should fall back to 'default_optimizer'.
  unspecified_comp_opt = optimizer.CompositeOptimizer.Params().Set(
      optimizer_map={
          'fc/w': (adam_op_2, 1.),
          'default_optimizer': (adam_op_2, 1.)
      }).Instantiate()
  sgd_op = optimizer.SGD.Params()
  adagrad_op = optimizer.Adagrad.Params()
  # 'fc/w' matches both the 'fc/w' and '.' regexes, which must be an error.
  overlapping_comp_opt = optimizer.CompositeOptimizer.Params().Set(
      optimizer_map={
          'fc/w': (sgd_op, 1.),
          '.': (adagrad_op, 1.),
          'default_optimizer': (adagrad_op, 1.)
      }).Instantiate()
  params = layers.FCLayer.Params()
  params.name = 'fc'
  params.dtype = tf.float64
  params.input_dim = 3
  params.output_dim = 2
  params.batch_norm = False
  fc_layer = layers.FCLayer(params)
  inputs = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
  output = fc_layer.FPropDefaultTheta(inputs)
  loss = tf.reduce_sum(output)
  var_grads = py_utils.ComputeGradients(loss, fc_layer.vars)
  self.assertIn('composite_optimizer_train_op',
                adam_rmsprop_opt.Apply(1e-1, var_grads).name)
  self.assertIn('composite_optimizer_train_op',
                unspecified_comp_opt.Apply(1e-1, var_grads).name)
  with self.assertRaisesRegex(
      Exception,
      'Variable fc/w/var:0 is matched 2 times by regex',
  ):
    overlapping_comp_opt.Apply(1e-1, var_grads)
def testAdjustGradientsWithL2Loss(self):
  """Tests L2 adjustment for dense and sparse (IndexedSlices) gradients."""
  with self.session(use_gpu=False) as sess:
    emb = tf.get_variable(
        'emb',
        initializer=tf.constant(np.arange(100).reshape([10, 10]), tf.float32))
    # tf.gather yields IndexedSlices gradients for 'emb'; rows 2 and 5 are
    # gathered with multiplicity 3 and 2 respectively.
    act = tf.gather(emb, [2, 5, 2, 2, 5])
    weight = tf.get_variable(
        'w', initializer=tf.constant(np.ones([10, 1]), tf.float32))
    bias = tf.get_variable('b', initializer=tf.constant([0.217]))
    # stop_gradient keeps 'bias' out of the computed gradients entirely.
    pred = tf.matmul(act, weight) + tf.stop_gradient(bias)
    loss = tf.reduce_sum(pred)
    vmap = py_utils.NestedMap(emb=emb, weight=weight, bias=bias)
    var_grads = py_utils.ComputeGradients(loss, vmap)
    self.assertEqual(sorted(var_grads.keys()), ['emb', 'weight'])
    l2_loss, var_grads_with_l2 = py_utils.AdjustGradientsWithLpLoss(
        var_grads, 0.1, p=2.0)
    sess.run(tf.global_variables_initializer())
    var_grads_vals, l2_loss_val, var_grads_with_l2_vals = sess.run(
        [var_grads, l2_loss, var_grads_with_l2])
    print('var_grads_vals = ', var_grads_vals)
    print('var_grads_with_l2_vals = ', var_grads_with_l2_vals)
    # Variable values themselves are unchanged by the adjustment.
    self.assertAllEqual(var_grads_vals.emb[0], var_grads_with_l2_vals.emb[0])
    self.assertAllEqual(var_grads_vals.weight[0],
                        var_grads_with_l2_vals.weight[0])
    # L2 loss counts each touched embedding row once, regardless of how
    # many times it was gathered.
    self.assertAllEqual(
        l2_loss_val,
        0.5 * 0.1 * (np.sum(np.square(var_grads_vals.weight[0])) + np.sum(
            np.square(var_grads_vals.emb[0][2, :])) + np.sum(
                np.square(var_grads_vals.emb[0][5, :]))))
    # With l2, gradients of emb and weight are adjusted.
    self.assertAllClose(
        var_grads_with_l2_vals.weight[1],
        var_grads_vals.weight[1] + 0.1 * var_grads_vals.weight[0])
    self.assertAllClose(var_grads_with_l2_vals.emb[1].indices,
                        var_grads_vals.emb[1].indices)
    self.assertAllClose(var_grads_with_l2_vals.emb[1].indices,
                        [2, 5, 2, 2, 5])
    # Each occurrence of a row gets 1/frequency of that row's L2 term.
    self.assertAllClose(
        var_grads_with_l2_vals.emb[1].values,
        var_grads_vals.emb[1].values + 0.1 * np.array(
            [[1 / 3.], [1 / 2.], [1 / 3.], [1 / 3.], [1 / 2.]]) *
        var_grads_vals.emb[0][[2, 5, 2, 2, 5], :])
def testRematerialize(self):
  """Tests dropout consistency between fprop and rematerialized bprop."""
  # Test the dropout consistency between fprop and bprop.
  b = builder.Base.Params()
  b = b.Instantiate()
  start_block = layers.DeterministicDropoutLayer.Params().Set(
      name='start_dropout', keep_prob=0.7)
  # Build 4 dropout layers, each wrapped by RematerializeFn.
  num_blocks = 4
  blocks = []
  blocks_per_cell = 2
  for i in range(num_blocks):
    blocks.append(layers.DeterministicDropoutLayer.Params().Set(
        name='dropout_{}'.format(i), keep_prob=0.7))
  cells = []
  while blocks:
    # Pack blocks_per_cell consecutive layers into one rematerialized cell.
    heads, blocks = blocks[:blocks_per_cell], blocks[blocks_per_cell:]
    cell_name = 'cell_{}'.format(len(cells))
    cells.append(
        b._Rematerialize(name=cell_name, body=b._Seq(cell_name, *heads)))
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    tf.random.set_seed(12345)
    p = b._Seq('test', start_block, *cells)
    mdl = p.Instantiate()
    # y = mdl.Frop(x * w)
    # Fake input
    x = tf.ones([4, 5])
    # Construct weights.
    w = tf.get_variable(
        'w',
        shape=[4, 5],
        initializer=tf.constant_initializer([[1] * 5] * 4))
    y = mdl.FPropDefaultTheta(x * w)
    # Construct loss function such that gradients = final activation.
    # dy/dw = y = mdl.Frop(x * w) when w is 1.
    loss = tf.reduce_sum(y)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    tf.global_variables_initializer().run()
    y_val, grads_val = sess.run([y, grads.Transform(tuple)])
    grads_val = grads_val['w'][1]
    self.assertAllClose(y_val, grads_val)
    # Pins the exact final step seed: fprop and the bprop-time recompute
    # must consume the same deterministic seed sequence.
    self.assertEqual(py_utils.GetStepSeed().eval(), 1553244033)
def testDropoutInRecurrent(self, splits=1, num_micro_batches=1):
  """Tests deterministic-dropout gradients under pipelined splits."""
  assert splits in [1, 2, 4]
  with self.session() as sess:
    tf.set_random_seed(12345)
    num_layers = 4
    py_utils.GetOrCreateGlobalStep()
    # Build a model with 4 dropout layers.
    layers = []
    for l in range(num_layers):
      layers.append(DeterministicDropoutLayer.Params().Set(
          name='dropout_{}'.format(l), keep_prob=0.7))
    # Divide the model into splits partitions.
    cell_tpl = []
    layers_per_split = num_layers // splits
    for i in range(splits):
      sub = layers[i * layers_per_split:(i + 1) * layers_per_split]
      cell_tpl.append(FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(i), sub=sub))
    # Parallelize partitions using pipeline.
    p = PipeliningLayer.Params().Set(
        name='pipeline',
        num_micro_batches=num_micro_batches,
        cell_tpl=cell_tpl)
    # Fake input
    x = tf.ones([2, 3])
    # Construct weights.
    w = tf.get_variable(
        'w',
        shape=[2, 3],
        initializer=tf.constant_initializer([[1] * 3] * 2))
    mdl = p.cls(p)
    y = mdl.FPropDefaultTheta(x * w)
    # Construct loss function such that gradients = final activation.
    loss = tf.reduce_sum(y)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    tf.global_variables_initializer().run()
    y_val = sess.run(y)
    # grads holds (var, grad) pairs: [1] selects the gradient for 'w'.
    grads_val = sess.run(grads)['w'][1]
    self.assertAllClose(y_val, grads_val)
def testDropoutInRecurrent(self, graph_seed):
  """Tests deterministic dropout applied inside recurrent.Recurrent."""
  with self.session() as sess:
    if graph_seed:
      tf.random.set_seed(12345)
    l = lingvo_layers.DeterministicDropoutLayer.Params().Set(
        name='dropout', keep_prob=0.7).Instantiate()
    # Input variable.
    w = tf.get_variable(
        'w', shape=[9, 20], initializer=tf.ones_initializer())
    sess.run(tf.global_variables_initializer())
    prev_sum = np.sum(np.isclose(sess.run(w), 0.0))

    def Step(theta, state0, unused_inputs):
      # Apply dropout to the carried state on every timestep.
      w = l.FProp(theta.l, state0.w)
      state1 = py_utils.NestedMap(w=w)
      return state1, py_utils.NestedMap()

    acc, final = recurrent.Recurrent(
        theta=py_utils.NestedMap(l=l.theta),
        state0=py_utils.NestedMap(w=w),
        inputs=py_utils.NestedMap(x=tf.zeros([4])),
        cell_fn=Step)
    acc_w = sess.run(acc.w)
    self.assertLen(acc_w, 4)
    # Each timestep should zero out strictly more entries than the previous.
    for acc_w_i in acc_w:
      next_sum = np.sum(np.isclose(acc_w_i, 0.0))
      self.assertGreater(next_sum, prev_sum)
      prev_sum = next_sum
    # Construct loss function such that gradients = final activation.
    loss = tf.reduce_sum(final.w)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    w_val, grads_val = sess.run([final.w, grads.w.grad])
    self.assertAllClose(w_val, grads_val)
def testAccumulator(self):
  # testAccumulator compares
  # - explicit averaging of independently computed var_grads1 and
  #   var_grads2,
  # - Accumulator(SGD) optimizer effectively doing this over 2 steps.
  np.random.seed(12345)
  np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
  np.random.seed(12346)
  np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

  # Baseline: two chained SGD updates, each with gradients scaled by 1/2,
  # which together equal one update with the averaged gradient.
  with self.session(use_gpu=True, graph=tf.Graph()) as sess:
    tf.random.set_seed(123456)
    params = layers.ProjectionLayer.Params()
    params.name = 'proj'
    params.dtype = tf.float64
    params.input_dim = 3
    params.output_dim = 2
    params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
    params.batch_norm = False
    proj_layer = layers.ProjectionLayer(params)
    inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
    inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
    output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
    output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
    loss1 = tf.reduce_sum(output1)
    loss2 = tf.reduce_sum(output2)
    var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
    var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
    op = optimizer.SGD.Params()
    opt = op.Instantiate()
    lr = 1e-1
    # Chain the two updates so they run in a fixed order.
    with tf.control_dependencies([loss1, loss2]):
      var_update_op1 = opt.Apply(
          lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
    with tf.control_dependencies([var_update_op1]):
      var_update_op2 = opt.Apply(
          lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))
    self.evaluate(tf.global_variables_initializer())
    vars1 = self.evaluate(proj_layer.vars.Flatten())
    loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
        [
            loss1,
            var_grads1.Transform(tuple),
            loss2,
            var_grads2.Transform(tuple)
        ],
        feed_dict={
            inputs1: np_input1,
            inputs2: np_input2,
        },
    )
    sess.run(
        [var_update_op2],
        feed_dict={
            inputs1: np_input1,
            inputs2: np_input2,
        })
    vars1_1 = self.evaluate(proj_layer.vars.Flatten())

  # Accumulator(SGD): accumulate gradients over 2 steps, then apply once.
  with self.session(use_gpu=True, graph=tf.Graph()) as sess:
    tf.random.set_seed(123456)
    params = layers.ProjectionLayer.Params()
    params.name = 'proj'
    params.dtype = tf.float64
    params.input_dim = 3
    params.output_dim = 2
    params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
    params.batch_norm = False
    proj_layer = layers.ProjectionLayer(params)
    in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
    inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
    loss = tf.reduce_sum(output1)
    var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
    op = optimizer.Accumulator.Params().Set(
        accum_steps=2, dtype=tf.float64, optimizer_tpl=optimizer.SGD.Params())
    opt = op.Instantiate()
    lr = 1e-1
    with cluster_factory.ForTestingWorker(add_summary=True):
      var_update_op = opt.Apply(lr, var_grads)
    increment_global_step_op = tf.assign_add(
        py_utils.GetOrCreateGlobalStepVar(), 1)
    self.evaluate(tf.global_variables_initializer())
    vars2 = self.evaluate(proj_layer.vars.Flatten())
    loss2_1, grads2_1 = sess.run(
        [loss, var_grads.Transform(tuple)], feed_dict={
            inputs1: np_input1,
        })
    loss2_2, grads2_2 = sess.run(
        [loss, var_grads.Transform(tuple)], feed_dict={
            inputs1: np_input2,
        })
    acc_0 = self.evaluate(
        [v for v in tf.global_variables()
         if 'grad_accumulator' in v.name])[0]
    sess.run([var_update_op], feed_dict={
        inputs1: np_input1,
    })
    acc_1 = self.evaluate(
        [v for v in tf.global_variables()
         if 'grad_accumulator' in v.name])[0]
    vars2_intermediate = self.evaluate(proj_layer.vars.Flatten())
    self.evaluate(increment_global_step_op)
    sess.run([var_update_op], feed_dict={
        inputs1: np_input2,
    })
    acc_2 = self.evaluate(
        [v for v in tf.global_variables()
         if 'grad_accumulator' in v.name])[0]
    vars2_1 = self.evaluate(proj_layer.vars.Flatten())
    summary = tf.Summary.FromString(self.evaluate(tf.summary.merge_all()))
    tf.logging.info(f'summary: {summary}')
    self.assertEqual(summary.value[0].tag, 'sgd_lr')
    self.assertAllClose(vars1, vars2)
    # Accumulator starts at zero, holds step-1 grads, then resets after
    # the accum_steps-th step applies the update.
    self.assertAllClose(acc_0, np.zeros_like(acc_0))
    self.assertAllClose(acc_1, grads2_1['w'][1])
    self.assertAllClose(acc_2, np.zeros_like(acc_0))
    self.assertAllClose(loss1_1, loss2_1)
    self.assertAllClose(loss1_2, loss2_2)
    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)
    # No variable change until the accumulation window completes.
    self.assertAllClose(vars1, vars2_intermediate)
    self.assertAllClose(vars2[0], grads2_1['w'][0])
    self.assertAllClose(vars2[0], grads2_2['w'][0])
    self.assertAllClose(
        vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]),
        vars1_1[0])
    self.assertAllClose(
        vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]),
        vars2_1[0])
    self.assertAllClose(vars2, vars2_intermediate)
    self.assertAllClose(vars1_1, vars2_1)
def Apply(self, loss, vmap, gradient_mask=None, gradient_adjuster=None): """Computes updates on 'vmap' to optimize 'loss'. TODO(rpang): explore merging gradient_mask and gradient_adjuster. Args: loss: A scalar Tensor. vmap: A `.NestedMap` object containing variables to optimize. gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar. gradient_adjuster: if not None, a function that mutates a given var_grads. Returns: (op, stats), where op is a tf.Operation to update variables and stats is a NestedMap containing 'has_nan_or_inf' and 'eval_metrics'. """ # We apply gradients outside the name_scope to maintain backwards # compatibility on variables created by self.optimizer.Apply(). p = self.params pos = re.compile( p.bprop_variable_filter) if p.bprop_variable_filter else None neg = re.compile( p.bprop_variable_exclusion) if p.bprop_variable_exclusion else None def VariableFilter(v): """Returns True if variable v should be optimized by this learner.""" if pos and not pos.search(v.name): tf.logging.info('%s: disabled by bprop_variable_filter: %s', p.name, v.name) return False if neg and neg.search(v.name): tf.logging.info('%s: disabled by bprop_variable_exclusion: %s', p.name, v.name) return False return True vmap = vmap.Filter(VariableFilter) for v in vmap.Flatten(): tf.logging.info('%s: bprop variable: %s', p.name, v.name) # Compute gradients. var_grads = py_utils.ComputeGradients(loss, vmap, p.grad_aggregation_method, p.colocate_gradients_with_ops, p.gate_gradients) # L2 regularizer. if p.l2_regularizer_weight is not None: l2_loss, var_grads = py_utils.AdjustGradientsWithLpLoss( var_grads, p.l2_regularizer_weight, p=2.0) self._AddScalarSummary('l2_loss', l2_loss) # L1 regularizer. if p.l1_regularizer_weight is not None: l1_loss, var_grads = py_utils.AdjustGradientsWithLpLoss( var_grads, p.l1_regularizer_weight, p=1.0) self._AddScalarSummary('l1_loss', l1_loss) # Mask gradients only if the mask is set. 
if gradient_mask: var_grads = py_utils.MaskGradients(var_grads, gradient_mask) # Apply gradient clipping. scaled_vars = self.ScaleGradients(var_grads, gradient_adjuster) has_nan_or_inf = scaled_vars.has_nan_or_inf var_grads = scaled_vars.final_var_grads # Histogram summary. summary_utils.CollectVarHistogram(var_grads) self._var_grads = var_grads assert self.theta.global_step is not None, self.theta lrs = self.lr_schedule.Value(self.theta.global_step) self._AddScalarSummary('lr_schedule', lrs) lr = p.learning_rate * lrs var_update_op = self.optimizer.Apply(lr, var_grads) stats = py_utils.NestedMap(has_nan_or_inf=has_nan_or_inf, eval_metrics=self._eval_metrics) return var_update_op, stats
def ComputeGradients(self, loss, vmap, *args, **kwargs):
  """Computes gradients of `loss` w.r.t. `vmap`; subclasses may override."""
  # Force the task's bf16 gradient all-reduce setting, overriding any
  # caller-supplied value, then delegate to the shared implementation.
  kwargs.update(use_bf16_gradients_ar=self.params.use_bf16_gradients_ar)
  return py_utils.ComputeGradients(loss, vmap, *args, **kwargs)
def _BPropForVariables(self, vmap):
  """Constructs the backward graph for the given variables.

  Builds gradient computation, Lp regularization, optional gradient
  masking, clipping, the optimizer update, global-step increments, batch
  norm updates, and stats/mask updates, then groups them all into
  self._train_op.

  Args:
    vmap: a `.NestedMap` of variables.
  """
  p = self.params
  tp = p.train
  # Compute gradients.
  self._var_grads = py_utils.ComputeGradients(self.loss, vmap)
  # L2 regularizer.
  if tp.l2_regularizer_weight is not None:
    l2_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
        self._var_grads, tp.l2_regularizer_weight, p=2.0)
    summary_utils.scalar(p, 'l2_loss', l2_loss)
  # L1 regularizer.
  if tp.l1_regularizer_weight is not None:
    l1_loss, self._var_grads = py_utils.AdjustGradientsWithLpLoss(
        self._var_grads, tp.l1_regularizer_weight, p=1.0)
    summary_utils.scalar(p, 'l1_loss', l1_loss)
  # Mask gradients only if the mask is set.
  if self._per_input_gradient_mask:
    bprop_onehot = self.input_generator.GetInputSourceOneHot()
    self._var_grads = py_utils.MaskGradients(
        self._var_grads, self._per_input_gradient_mask, bprop_onehot)
  # Apply gradient clipping.
  has_nan_or_inf, _, self._var_grads = self.ScaleGradients(self._var_grads)
  # Histogram summary.
  summary_utils.CollectVarHistogram(p, self._var_grads)
  lrs = self.lr_schedule.Value(self._global_step)
  summary_utils.scalar(p, 'lr_schedule', lrs)
  lr = tp.learning_rate * lrs
  var_update_op = self.optimizer.Apply(lr, self._var_grads)
  increment_global_step_ops = []
  # Colocate each increment with its step variable's device.
  with tf.colocate_with(self._shared_global_step):
    increment_global_step_ops.append(
        tf.assign_add(self._shared_global_step, 1))
  if self._task_global_step:
    with tf.colocate_with(self._task_global_step):
      increment_global_step_ops.append(
          tf.assign_add(self._task_global_step, 1))
  increment_global_steps = tf.group(*increment_global_step_ops)
  # Only run the batch-norm moving-average updates that feed this loss.
  relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
      self.loss, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
  batch_norm_updates = tf.group(*relevant_bn_updates)
  # Update stats.
  stats_updates = tf.group(
      self.IncrementTotalSamples(),
      self.IncrementTotalNans(tf.to_int32(has_nan_or_inf)))
  # Post training step update.
  post_training_step_updates = self.PostTrainingStepUpdate(
      self._global_step)
  # Get the op to update the weight masks and thresholds
  mask_update_op = self._GetMaskUpdateOp()
  # TODO(rpang): try to structure _train_op as:
  # tf.cond(skip_step, <only update skip stats>, <all updates>)
  # so that we skip all other updates when a step is skipped.
  # if p.contiguous: var_update_op = tf.group(var_update_op, self.last_state_group_op)
  self._train_op = tf.group(
      var_update_op,
      batch_norm_updates,
      stats_updates,
      post_training_step_updates,
      increment_global_steps,
      mask_update_op,
      name='train')
def ComputeGradients(self, loss, vmap, *args, **kwargs):
  """Allows subclasses control computation of gradients.

  Pure delegation to py_utils.ComputeGradients; exists as an override
  point for subclasses that need custom gradient computation.
  """
  return py_utils.ComputeGradients(loss, vmap, *args, **kwargs)
def testAccumulator(self):
  # testAccumulator compares
  # - explicit averaging of independently computed var_grads1 and
  #   var_grads2,
  # - Accumulator(SGD) optimizer effectively doing this over 2 steps.
  np.random.seed(12345)
  np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
  np.random.seed(12346)
  np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

  # Graph 1 (baseline): two chained half-weighted SGD updates.
  g1 = tf.Graph()
  with g1.as_default():
    tf.set_random_seed(123456)
    params = layers.ProjectionLayer.Params()
    params.name = 'proj'
    params.dtype = tf.float64
    params.input_dim = 3
    params.output_dim = 2
    params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
    params.is_eval = False
    params.batch_norm = False
    proj_layer = layers.ProjectionLayer(params)
    inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
    inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
    output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
    output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
    loss1 = tf.reduce_sum(output1)
    loss2 = tf.reduce_sum(output2)
    var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
    var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
    op = optimizer.SGD.Params().Set(add_summary=False)
    opt = op.cls(op)
    lr = 1e-1
    # Chain the updates so they execute in a fixed order.
    with tf.control_dependencies([loss1, loss2]):
      var_update_op1 = opt.Apply(
          lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
    with tf.control_dependencies([var_update_op1]):
      var_update_op2 = opt.Apply(
          lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))
    init_op = tf.global_variables_initializer()
  with self.session(use_gpu=True, graph=g1) as sess:
    sess.run(init_op)
    vars1 = sess.run(proj_layer.vars.Flatten())
    loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
        [loss1, var_grads1, loss2, var_grads2],
        feed_dict={
            inputs1: np_input1,
            inputs2: np_input2,
        })
    sess.run(
        [var_update_op2],
        feed_dict={
            inputs1: np_input1,
            inputs2: np_input2,
        })
    vars1_1 = sess.run(proj_layer.vars.Flatten())

  # Graph 2: Accumulator(SGD) accumulating the same gradients over 2 steps.
  g2 = tf.Graph()
  with g2.as_default():
    tf.set_random_seed(123456)
    params = layers.ProjectionLayer.Params()
    params.name = 'proj'
    params.dtype = tf.float64
    params.input_dim = 3
    params.output_dim = 2
    params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
    params.is_eval = False
    params.batch_norm = False
    proj_layer = layers.ProjectionLayer(params)
    in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
    inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
    output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
    loss = tf.reduce_sum(output1)
    var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
    op = optimizer.Accumulator.Params().Set(
        accum_steps=2,
        dtype=tf.float64,
        optimizer_tpl=optimizer.SGD.Params().Set(add_summary=False))
    opt = op.cls(op)
    lr = 1e-1
    var_update_op = opt.Apply(lr, var_grads)
    init_op = tf.global_variables_initializer()
    global_step = py_utils.GetOrCreateGlobalStep()
    increment_global_step_op = tf.assign_add(global_step, 1)
  with self.session(use_gpu=True, graph=g2) as sess:
    sess.run(init_op)
    # Note: rebinds 'global_step' from the variable to its numpy value.
    vars2, global_step = sess.run(
        [proj_layer.vars.Flatten(), global_step])
    loss2_1, grads2_1 = sess.run(
        [loss, var_grads], feed_dict={
            inputs1: np_input1,
        })
    loss2_2, grads2_2 = sess.run(
        [loss, var_grads], feed_dict={
            inputs1: np_input2,
        })
    acc_0 = sess.run(
        [v for v in tf.global_variables()
         if 'grad_accumulator' in v.name])[0]
    sess.run([var_update_op], feed_dict={
        inputs1: np_input1,
    })
    acc_1 = sess.run(
        [v for v in tf.global_variables()
         if 'grad_accumulator' in v.name])[0]
    vars2_intermediate = sess.run(proj_layer.vars.Flatten())
    sess.run(increment_global_step_op)
    sess.run([var_update_op], feed_dict={
        inputs1: np_input2,
    })
    acc_2 = sess.run(
        [v for v in tf.global_variables()
         if 'grad_accumulator' in v.name])[0]
    vars2_1 = sess.run(proj_layer.vars.Flatten())
  self.assertAllClose(vars1, vars2)
  # Accumulator starts at zero, holds step-1 grads, then resets after the
  # accum_steps-th step applies the averaged update.
  self.assertAllClose(acc_0, np.zeros_like(acc_0))
  self.assertAllClose(acc_1, grads2_1['w'][1])
  self.assertAllClose(acc_2, np.zeros_like(acc_0))
  self.assertAllClose(loss1_1, loss2_1)
  self.assertAllClose(loss1_2, loss2_2)
  self.assertAllClose(grads1_1, grads2_1)
  self.assertAllClose(grads1_2, grads2_2)
  # No variable change until the accumulation window completes.
  self.assertAllClose(vars1, vars2_intermediate)
  self.assertAllClose(vars2[0], grads2_1['w'][0])
  self.assertAllClose(vars2[0], grads2_2['w'][0])
  self.assertAllClose(
      vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]),
      vars1_1[0])
  self.assertAllClose(
      vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]),
      vars2_1[0])
  self.assertAllClose(vars2, vars2_intermediate)
  self.assertAllClose(vars1_1, vars2_1)