def testCausalNormalizedDepthwiseConv2DLayerBackProp(self):
  with self.session(use_gpu=True) as sess:
    output = self._testNormalizedDepthwiseConv2DHelper(
        is_causal=True, dropconnect_prob=0.1)
    loss = tf.reduce_sum(output)
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    self.evaluate(tf.global_variables_initializer())
    sym_grads = [sg.eval() for sg in grads]
    num_grads = [
        test_utils.ComputeNumericGradient(sess, loss, v) for v in all_vars
    ]
    for sg, ng in zip(sym_grads, num_grads):
      self.assertAllClose(sg, ng, rtol=1e-02, atol=1e-02)
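# (Illustrative sketch, not the real test_utils.ComputeNumericGradient.) The
# check above compares tf.gradients output against a finite-difference
# estimate; a central-difference estimate looks roughly like this, using
# numpy (imported as np elsewhere in these tests):
def _NumericGradSketch(f, x, delta=1e-3):
  """Central-difference estimate of d f(x) / dx for scalar f and 1-D x."""
  grad = np.zeros_like(x)
  for i in range(x.size):
    xp, xm = x.copy(), x.copy()
    xp[i] += delta
    xm[i] -= delta
    grad[i] = (f(xp) - f(xm)) / (2.0 * delta)
  return grad

# E.g. _NumericGradSketch(lambda v: float(np.sum(v ** 2)), np.array([1., -2.]))
# returns approximately [2., -4.].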
def _verify_timestep_counts(self, num_splits, auto_partition=False,
                            micro_batch_size=None):
  num_micro_batches = 8
  batch_size = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.random.set_seed(1245)
    inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
    if auto_partition:
      layers = [
          _SimpyLayer.Params().Set(name='layer_{}'.format(i))
          for i in range(16)
      ]
      net = PipeliningLayer.Params().Set(
          name='pipeline',
          num_micro_batches=num_micro_batches,
          cell_tpl=_Partition(layers, num_splits,
                              tshape.Shape([batch_size, 8, 8,
                                            1]))).Instantiate()
    else:
      net = _BuildDummyPipelineCnn(
          num_splits=num_splits,
          micro_batch_size=micro_batch_size,
          num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
      aux_logits = None
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    for ts_val in list(ts_vals):
      expected_ts = num_micro_batches if num_splits > 1 else 1
      self.assertEqual(ts_val, expected_ts)
    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
def testPaddedMeanGrad(self):
  b = builder_lib.ModelBuilderBase()
  p = b._Seq('seq', b._FeaturesFC('fc', 5, 10), b._PaddedMean('p'))
  l = p.Instantiate()
  _, x = self._getNestedMapTestData()
  y = l.FPropDefaultTheta(x)
  loss = tf.reduce_sum(y)
  all_vars = tf.trainable_variables()
  grads = tf.gradients(loss, all_vars)

  with self.session():
    self.evaluate(tf.global_variables_initializer())
    np_grads = self.evaluate(grads)
    for np_grad in np_grads:
      self.assertTrue(np.all(np.isfinite(np_grad)))
def Grad(h0, w, b, x, padding, h1, dh1):
  del b
  dh1_orig = dh1
  dh1 = _ApplyPadding(padding, dh1, tf.zeros_like(dh1, dtype=dh1.dtype))

  # We hand-roll the gradient for the 2nd half of the cell as a demo.
  # h1 = tf.sigmoid(xw + b)
  # 𝛔'(x) = ((1 - 𝛔(x)) * 𝛔(x))
  dxwb = (dh1 * (1 - h1) * h1)
  dxw, db = dxwb, tf.reduce_sum(dxwb, axis=0)

  # Uses tf.gradients for the 1st half of the cell as a demo.
  xw = py_utils.Matmul(tf.concat([x, h0], axis=1), w)
  dh0, dx, dw = tf.gradients(ys=[xw], xs=[h0, x, w], grad_ys=[dxw])

  dh0 = _ApplyPadding(padding, dh0, dh1_orig)
  return dh0, dx, dw, db
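# (Illustrative check, not part of the original cell.) The hand-rolled term
# above relies on the sigmoid derivative identity: for h = sigmoid(z),
# dh/dz = h * (1 - h), so chaining an upstream gradient dh1 gives
# dh1 * (1 - h1) * h1. A small numpy sketch verifying the identity by
# finite differences:
def _CheckSigmoidGradIdentity():
  sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
  z, eps = 0.7, 1e-6
  h = sigmoid(z)
  analytic = h * (1.0 - h)
  numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2.0 * eps)
  assert abs(analytic - numeric) < 1e-8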
def _DecoderGradientCheckerHelper(self,
                                  decoder_cls,
                                  feed_att_context_to_softmax=False):
  with self.session(use_gpu=True, graph=tf.Graph()) as sess:
    tf.set_random_seed(_TF_RANDOM_SEED)
    p = self._DecoderParams(dtype=tf.float64, decoder_cls=decoder_cls)
    p.feed_attention_context_vec_to_softmax = feed_att_context_to_softmax
    dec = p.Instantiate()
    encoder_outputs, targets = self._Inputs(dtype=tf.float64)
    loss, _ = dec.FPropDefaultTheta(encoder_outputs, targets).metrics['loss']
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    print('num of vars ', len(all_vars))

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    grads = [DenseGrad(x, y) for x, y in zip(all_vars, grads)]

    tf.global_variables_initializer().run()
    symbolic_grads = [gd.eval() for gd in grads]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(
          test_utils.ComputeNumericGradient(sess, loss, v, delta=1e-5))

    rets = {}
    for v, x, y in zip(all_vars, symbolic_grads, numerical_grads):
      print('symbolic_grads, numerical_grads :', v.name)
      print(x)
      print(y)
      self.assertAllClose(x, y)
      rets[v.name] = x

    return rets
def test_entmax_loss_generate_right_gradient(self):
  inputs = tf.constant([[0.5, 1.0, 2.0]] * 3)
  labels = tf.constant([0, 1, 2])
  expected_loss_gradient = tf.constant(
      [[[-0.97671956, 0.16207013, 0.8146494],
        [0.02328045, -0.83792984, 0.8146494],
        [0.02328045, 0.16207013, -0.1853506]]])
  # Convert to the matrix with given depth, e.g. the vocabulary size.
  labels = tf.one_hot(labels, depth=3)
  expected_loss = tf.constant(2.692692)
  entmax_loss_val = tf.reduce_sum(entmax.entmax_loss(labels, inputs, 1.5))
  entmax_loss_gradient_val = tf.gradients(entmax_loss_val, inputs)

  with self.session(use_gpu=False) as sess:
    loss_output = sess.run(entmax_loss_val)
    gradient_output = sess.run(entmax_loss_gradient_val)
    self.assertAllClose(expected_loss, loss_output)
    self.assertAllClose(expected_loss_gradient, gradient_output)
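# (Illustrative note, not part of the original test.) For Fenchel-Young losses
# such as the 1.5-entmax loss, the gradient w.r.t. the logits is
#   d(loss)/d(logits) = entmax(logits) - one_hot(labels).
# With logits [0.5, 1.0, 2.0], 1.5-entmax gives roughly
# [0.0233, 0.1621, 0.8146], so subtracting each one-hot label row reproduces
# the rows of expected_loss_gradient above.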
def testBasicGrad(self):
  p = self._testParams(dtype=tf.float64)
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    lm = p.Instantiate()
    inputs, paddings, targets = self._testInputs(dtype=tf.float64)
    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=paddings,
        labels=py_utils.NestedMap(
            class_weights=1 - paddings, class_ids=targets))

    lm_vars = lm.vars.Flatten()
    # Now add the backward graph.
    grads = tf.gradients(xent_output.avg_xent, lm_vars)

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, delta=1e-6)
      self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)
def testSimpleStacked(self):
  g = tf.Graph()
  with g.as_default():
    devices = ['/cpu:0'] * 3
    cell_fns = [self.Poly, self.Identity, self.Identity]
    cell_grads = [None] * 3
    cell_outs = [lambda x: x] * 3
    cell_out_grads = [lambda x: x] * 3
    w0 = tf.constant(2.)
    w1 = tf.constant(0.)
    w2 = tf.constant(0.)
    thetas = [
        py_utils.NestedMap(x=w0),
        py_utils.NestedMap(x=w1),
        py_utils.NestedMap(x=w2)
    ]
    init_states = [py_utils.NestedMap(s=tf.constant(0.))] * 3
    inputs = py_utils.NestedMap(
        c=tf.constant([1., 2., 1., 0.]),
        padding=tf.constant([0., 0., 0., 1.]))
    output, _ = recurrent.StackedRecurrent(
        devices=devices,
        cell_fns=cell_fns,
        cell_grads=cell_grads,
        cell_outs=cell_outs,
        cell_out_grads=cell_out_grads,
        thetas=thetas,
        init_states=init_states,
        inputs=inputs)
    dw0, dw1, dw2 = tf.gradients(tf.reduce_sum(output.s), [w0, w1, w2])

  with self.session(graph=g):
    (output, dw0, dw1, dw2) = self.evaluate([output.s, dw0, dw1, dw2])
    self.assertAllClose(output, [1., 4., 9., 0.])
    self.assertAllClose(dw2, 0.)
    self.assertAllClose(dw1, 0.)
    self.assertAllClose(dw0, 7.)
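# (Illustrative derivation of the golden values above, assuming self.Poly
# computes s_t = s_{t-1} * x + c_t, which matches the expected outputs with
# x = w0 = 2 and c = [1, 2, 1]; the last step is padded out.)
#   s = [1, 4, 9]
#   d s1/dx = 0
#   d s2/dx = s1 + x * d s1/dx = 1
#   d s3/dx = s2 + x * d s2/dx = 4 + 2 * 1 = 6
# so d(sum s)/dx = 0 + 1 + 6 = 7, the expected dw0. The Identity cells pass
# their input through unchanged, so w1 and w2 receive zero gradient.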
def testMoEFFLayer(self, use_fflayer_start_moe, use_fflayer_end_moe,
                   expected_aux_loss):
  p = self._GetParams()
  if use_fflayer_start_moe:
    p.fflayer_start_tpl = gshard_builder.MoEBuilder.Params().Set(
        e_dim=2, c_dim=2, num_devices=2)
  if use_fflayer_end_moe:
    p.fflayer_end_tpl = gshard_builder.MoEBuilder.Params().Set(
        e_dim=2, c_dim=2, num_devices=2)
  l = p.Instantiate()
  if use_fflayer_start_moe:
    self.assertNotIn('fflayer_start', l.children)
    self.assertIn('fflayer_start_moe', l.children)
  if use_fflayer_end_moe:
    self.assertNotIn('fflayer_end', l.children)
    self.assertIn('fflayer_end_moe', l.children)

  inputs, paddings = self._GetInputs()
  inputs = tf.convert_to_tensor(inputs)
  paddings = tf.convert_to_tensor(paddings)
  in_nmap = py_utils.NestedMap(features=inputs, paddings=paddings)
  in_nmap.aux_loss = tf.convert_to_tensor(0., py_utils.FPropDtype(p))
  out_nmap = l.FPropDefaultTheta(in_nmap)
  self.assertIn('aux_loss', out_nmap)
  loss = tf.reduce_sum(out_nmap.features) + 0.01 * out_nmap.aux_loss
  grads = tf.gradients(
      loss,
      l.vars.Flatten(),
      unconnected_gradients=tf.UnconnectedGradients.ZERO)

  with self.session() as sess:
    tf.global_variables_initializer().run()
    out_vals = sess.run(out_nmap.features)
    grad_vals = sess.run(grads)
    self.assertEqual(out_nmap.aux_loss.shape, ())
    aux_loss = sess.run(out_nmap.aux_loss)
    self.assertAlmostEqual(expected_aux_loss, aux_loss, places=5)
    print([x.shape for x in out_vals])
    print([g.shape for g in grad_vals])
def testRepeatLayerNestedMapBProp(self):
  """Tests a RepeatLayer whose body layer takes a mutable NestedMap input."""
  repeat = 3
  input_dim, output_dim = 2, 2
  # RepeatLayer with NestedMap in `body` FProp input signature.
  p = layers.RepeatLayer.Params().Set(
      name='nested_map_recurrent',
      repeat=repeat,
      body=FCLayerTestNestedMapFPropInput.Params().Set(
          input_dim=input_dim, output_dim=output_dim))
  layer = p.Instantiate()

  with self.session() as sess:
    tf.random.set_seed(24332)
    sess.run(tf.global_variables_initializer())
    inputs = tf.random.normal(shape=[2, 5, 2])
    paddings = tf.zeros((2, 5, 1))
    args = py_utils.NestedMap(features=inputs, paddings=paddings)
    outputs = layer.FPropDefaultTheta(args)
    # Mutate 'args' before the bprop.
    args.features = tf.transpose(args.features, [1, 0, 2])
    args.paddings = tf.transpose(args.paddings, [1, 0, 2])
    in_grads = tf.gradients(ys=tf.nest.flatten(outputs), xs=[inputs])
    sess.run(in_grads)
def testBProp(self):
  vocab, time, batch = 7, 4, 3
  p = self._MoeLmParams(vocab, True)
  p.dtype = tf.float64

  with self.session(graph=tf.Graph()) as sess:
    np.random.seed(54321)
    tf.random.set_seed(123456)
    lm = p.Instantiate()
    inputs, paddings, labels = self._GetData(vocab, time, batch)
    sess.run(tf.global_variables_initializer())
    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=tf.cast(paddings, p.dtype),
        state0=lm.zero_state(lm.theta, batch),
        labels=labels)

    lm_vars = lm.vars.Flatten()
    # Now add the backward graph.
    grads = tf.gradients(xent_output.avg_xent, lm_vars)
    for i, x in enumerate(grads):
      if isinstance(x, tf.IndexedSlices):
        grads[i] = tf.math.unsorted_segment_sum(x.values, x.indices,
                                                x.dense_shape[0])

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    step = 11  # Speed up the test.
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, step=step, delta=1e-6)
      self.assertAllClose(
          grad_symbolic.reshape([-1])[::step],
          grad_numeric.reshape([-1])[::step])
def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
  config = tf.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(
              do_function_inlining=func_inline)))
  with self.session(use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372749040)
    np.random.seed(274854)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    p.dtype = tf.float64

    dec = p.Instantiate()
    src_seq_len = 5
    src_enc = tf.constant(
        np.random.uniform(size=(src_seq_len, 2, 8)), tf.float64)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float64)
    encoder_outputs = py_utils.NestedMap(
        encoded=src_enc, padding=src_enc_padding)
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]],
                    dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]],
                    dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]],
                    dtype=tf.float64))
    target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings
    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets).metrics
    loss = metrics['loss'][0]
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

    tf.global_variables_initializer().run()

    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
    # Second run to make sure the function is deterministic.
    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

    symbolic_grads = [x.eval() for x in dense_grads if x is not None]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(test_utils.ComputeNumericGradient(sess, loss, v))

    for x, y in zip(symbolic_grads, numerical_grads):
      self.assertAllClose(x, y)
def testStatefulEmbeddingStep(self):
  with self.session(use_gpu=False):
    tf.random.set_seed(398847392)
    p = embedding_steps.StatefulEmbeddingStep.Params().Set(
        name='emb_step',
        num_prev_tokens=1,
        include_current_token=False,
        target_sos_id=1,
        embedding_dim=3,
    )
    p.emb.Set(
        vocab_size=10,
        embedding_dim=3,
        max_num_shards=1,
        params_init=py_utils.WeightInit.Gaussian(0.01),
    )
    p.emb.vn.global_vn = False
    p.emb.vn.per_step_vn = False
    emb = p.Instantiate()

    # Verify that nothing bad happens when these methods are called.
    packed = emb.PrepareExternalInputs(None, None)
    state0 = emb.ZeroState(emb.theta, packed, 2)

    # Test FProp of the unit.
    out1, state1 = emb.FProp(
        emb.theta, packed,
        py_utils.NestedMap(inputs=[tf.constant([4, 3], tf.int32)]),
        tf.constant([0.0], dtype=tf.float32), state0)

    self.evaluate(tf.global_variables_initializer())
    out1, state1 = self.evaluate([out1, state1])

    self.assertAllEqual(state1.prev_ids, np.array([[4], [3]]))
    self.assertAllClose(
        out1.output,
        np.array([[-0.00740041, -0.00746862, 0.00093992],
                  [-0.00740041, -0.00746862, 0.00093992]]))

    # Test FProp and BProp when integrated with Recurrent().
    def _FProp(theta, state0, inputs):
      embedding, state1 = emb.FProp(theta, None, inputs, None, state0)
      state1.embedding = embedding.output
      return state1, py_utils.NestedMap()

    inputs = py_utils.NestedMap(inputs=[
        tf.constant([[1., 2.], [3., 2.], [0., 1.], [2., 3.], [3., 0.]])
    ])
    acc, _ = recurrent.Recurrent(emb.theta, state0, inputs, _FProp)
    loss = tf.math.l2_normalize(acc.embedding)
    grad = tf.gradients(loss, emb.emb.theta.wm[0])
    self.evaluate(tf.global_variables_initializer())
    acc_, _, grad_ = self.evaluate([acc, emb.emb.theta.wm[0], grad])

    prev_ids_expected = np.array([
        [[1.], [2.]],
        [[3.], [2.]],
        [[0.], [1.]],
        [[2.], [3.]],
        [[3.], [0.]],
    ])
    grad_expected = np.array([[21.952698, 20.50312, 19.037958],
                              [79.622116, 72.15271, 106.34329],
                              [41.631985, 70.19292, 75.52608],
                              [53.644493, 36.28507, 36.64856],
                              [0., 0., 0.], [0., 0., 0.], [0., 0., 0.],
                              [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
    self.assertAllClose(acc_.prev_ids, prev_ids_expected)
    self.assertAllClose(grad_[0], grad_expected)
def _Gradient(inputs, _, original_grad):

  # Compute the gradients for each loss w.r.t. the inputs.
  # TODO(jngiam): Look into whether TF dedups this computation.
  per_loss_grads = []
  for loss, _ in self._losses:
    per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
    if per_loss_grad is None:
      tf.logging.warning(
          'Loss %s did not result in a gradient during '
          'GradDrop computation.', loss)
    else:
      per_loss_grads.append(per_loss_grad)

  if not per_loss_grads:
    raise ValueError('No valid gradients for GradDrop.')

  # Multiply the gradients with the inputs.
  grads = per_loss_grads
  if p.use_input_sign_only:
    input_abs = tf.abs(
        tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
    grads = [grad * (inputs / input_abs) for grad in grads]
  else:
    grads = [grad * inputs for grad in grads]

  # Sum gradient over batch, assuming that batch is always on dim 0.
  if p.marginalize_batch_dim:
    grads = [tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads]

  # First discretize all gradients into their sign values.
  grad_sign_positive = [tf.cast(grad > 0.0, tf.float32) for grad in grads]
  grad_sign_negative = [tf.cast(grad < 0.0, tf.float32) for grad in grads]

  # Calculate the probability of positive gradients based on equation (1)
  # in the GradDrop paper.
  grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
  prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
  # Implementation of different scales for the keep function. Larger
  # scales result in steeper keep functions.
  prob_pos *= p.keep_prob_function_scale

  if p.keep_prob_function == 'sigmoid':
    # Standard sigmoid has derivative of 0.25 at 0, so the factor of 4.0
    # allows the function scale in sigmoid to be compatible with the
    # function scale in the linear case.
    prob_pos = tf.sigmoid(4.0 * prob_pos)
  elif p.keep_prob_function == 'linear':
    prob_pos += 0.5

  # The main, default mode of GradDrop. Only gradients of one sign are
  # kept, and which sign is calculated via equation (1) of the main paper.
  prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                     tf.float32) - 0.5
  grad_masks = [
      (gsp - gsn) * prob_pos >= 0
      for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
  ]

  # This diag value gives us the percentage of grads which are kept.
  gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
  diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
  summary_utils.scalar('average_grad_mask', diag)

  leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
  transformed_per_loss_grads = [
      grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
      for (leak, grad,
           grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
  ]

  transformed_grad = tf.cast(
      tf.add_n(transformed_per_loss_grads), original_grad.dtype)

  if not p.keep_gradnorm_constant:
    return transformed_grad

  transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
  original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
  return transformed_grad * original_grad_norm / (
      transformed_grad_norm + p.epsilon)
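# (Illustrative sketch, separate from the layer code above.) prob_pos encodes
# equation (1) of the GradDrop paper: the probability of keeping the positive
# sign at a unit is P = 0.5 * (1 + sum_i g_i / sum_i |g_i|) over the per-loss
# gradients g_i; the 'linear' keep function adds the 0.5 explicitly. A toy
# numpy version for two per-loss gradients at a single activation:
def _GradDropProbPosSketch():
  g = np.array([0.3, -0.1])  # per-loss gradients, already scaled by the input
  prob_pos = 0.5 * (1.0 + g.sum() / np.abs(g).sum())  # = 0.75
  # With probability 0.75 only positive-sign gradients are kept at this unit;
  # otherwise only negative-sign gradients are kept (plus any leak_ratio).
  return prob_pos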
def _BuildStackedRecurrentElman(self, seqlen, trailing_pad_len, batch, dims,
                                layers):
  tf.set_random_seed(342462)
  np.random.seed(32540)

  seqlen += trailing_pad_len
  dtype = tf.float64

  def CreateTheta():
    return py_utils.NestedMap(
        w=tf.constant(
            np.random.uniform(0, 0.2, (2 * dims, dims)), dtype=dtype),
        b=tf.constant(np.random.uniform(0, 0.2, (dims,)), dtype=dtype))

  def CreateState0():
    return py_utils.NestedMap(
        h=tf.constant(np.random.uniform(0, 0.2, (batch, dims)), dtype=dtype),
        padding=tf.constant([[0]] * batch, dtype=dtype))

  devices = ['/cpu:0'] * layers
  cell_fns = [self.Elman] * layers
  cell_grads = [self.ElmanGrad] * layers
  cell_outs = [self.ElmanOut] * layers
  cell_out_grads = [self.ElmanOutGrad] * layers
  thetas = [CreateTheta() for _ in range(layers)]
  init_states = [CreateState0() for _ in range(layers)]
  padding = np.zeros((seqlen, batch, 1))
  padding[-trailing_pad_len:, :, :] = 1.
  padding[-trailing_pad_len - 3:-trailing_pad_len - 1, :, :] = 1.
  inputs = py_utils.NestedMap(
      x=tf.constant(
          np.random.uniform(0, 0.2, (seqlen, batch, dims)), dtype=dtype),
      padding=tf.constant(padding, dtype=dtype))
  output, _ = recurrent.StackedRecurrent(
      devices=devices,
      cell_fns=cell_fns,
      cell_grads=cell_grads,
      cell_outs=cell_outs,
      cell_out_grads=cell_out_grads,
      thetas=thetas,
      init_states=init_states,
      inputs=inputs)
  o = output.x
  if 'padding' in inputs:
    o *= (1 - inputs.padding)
  loss = tf.reduce_sum(tf.square(o))

  xs = recurrent.Flatten(thetas + [py_utils.NestedMap(x=inputs.x)])
  dxs = tf.gradients(ys=loss, xs=xs)

  # Reference implementation using Recurrent().
  ref = inputs
  for i in range(layers):
    ref = self.ElmanOut(
        recurrent.Recurrent(
            cell_fn=cell_fns[i],
            cell_grad=cell_grads[i],
            theta=thetas[i],
            state0=init_states[i],
            inputs=ref)[0])
  return ref.x, output.x, loss, xs, dxs
def testBasicWithAccumulator(self):
  with self.session() as sess:
    p = _SampleAccumulatorLayer.Params()
    p.name = 'sample'
    accum_layer = _SampleAccumulatorLayer(p)
    accum_obj = accum_layer.accumulators[accum_layer.accumulator_name]

    theta = py_utils.NestedMap()
    theta.x = tf.constant(2.0)
    state = py_utils.NestedMap()
    state.value = tf.constant(0.0)
    state.x_power = tf.constant(1.0)
    inputs = py_utils.NestedMap()
    inputs.coeff = tf.constant([1., 2., 3.])

    def _CellFn(theta, state, inputs):
      print('TEST ACCUM WITHIN CellFn = ', accum_obj.GetValue())
      accum_obj.Update(inputs.coeff)
      return _Poly(theta, state, inputs)

    # By doing one accumulate prior to recurrent, we ensure that incoming
    # recurrent state is preserved.
    accum_obj.Update(10.)

    # x = 2
    # 1 + 2*x + 3*x^2
    ret = recurrent.Recurrent(
        theta, state, inputs, _CellFn, accumulator_layer=accum_layer)

    # Verify bprop.
    y = ret[1].value
    dx, d_coeff = tf.gradients(ys=[y], xs=[theta.x, inputs.coeff])
    dx_val, d_coeff_val = sess.run([dx, d_coeff])

    # 2 + 6*x
    self.assertAllClose(dx_val, 14.)
    self.assertAllClose(d_coeff_val, [1., 2., 4.])

    # acc = [1, 1+2x, 1+2x+3x^2]
    # sum(acc) = 3 + 4x + 3x^2
    acc = ret[0].value
    dx, d_coeff = tf.gradients(
        ys=[tf.reduce_sum(acc)], xs=[theta.x, inputs.coeff])
    dx_val, d_coeff_val = sess.run([dx, d_coeff])

    # 4 + 6*x
    self.assertAllClose(dx_val, 16.)
    self.assertAllClose(d_coeff_val, [3., 4., 4.])

    # Verify fprop.
    (acc, state), accum_obj_value = sess.run((ret, accum_obj.GetValue()))

    # Verify that accumulators don't change fprop results.
    self.assertAllClose(acc.value, [1., 5., 17.])
    self.assertAllClose(acc.x_power, [2., 4., 8.])
    self.assertAllClose(state.value, 17.)
    self.assertAllClose(state.x_power, 8.)

    # Verify accumulator (should be 10 (initial increment) + 1 + 2 + 3).
    self.assertEqual(0, accum_obj._disable_count)
    self.assertAllClose([accum_obj_value], [16.0])
def ReverseAndGrad(self, theta, outputs, d_outputs, f_seed, g_seed,
                   *extra_inputs):
  """Implements Algorithm 1 in the RevNet paper.

  Args:
    theta: A NestedMap object containing weights' values of this layer and
      its children layers.
    outputs: A NestedMap: .split1 and .split2 corresponding to y1 and y2.
    d_outputs: A NestedMap: .split1 and .split2 corresponding to dy1 and dy2,
      the total derivatives.
    f_seed: Scalar tensor. The step seed used in forward for the f block.
    g_seed: Scalar tensor. The step seed used in forward for the g block. The
      step seeds are needed for deterministic randomness, e.g. to ensure
      dropout generates the same random mask in forward and reverse_grad.
    *extra_inputs: Additional inputs that will be passed to both f and g. No
      gradient will be computed for these inputs.

  Returns:
    A tuple of NestedMaps

    - inputs: .split1 and .split2 corresponding to x1 and x2.
    - d_inputs: .split1 and .split2 corresponding to dx1 and dx2, the total
      derivatives with respect to inputs.
    - d_theta: has the same structure as theta. The total derivatives with
      respect to weights.
  """
  # Stop gradient on the outputs to avoid circular symbolic dependency.
  y1 = tf.stop_gradient(outputs.split1)
  y2 = tf.stop_gradient(outputs.split2)
  dy1 = d_outputs.split1
  dy2 = d_outputs.split2

  # Computes the reverse.
  z1 = y1
  py_utils.ResetStepSeed(g_seed)
  gz1 = self.g_block.FProp(theta.g_block, z1, *extra_inputs)
  x2 = y2 - gz1
  py_utils.ResetStepSeed(f_seed)
  fx2 = self.f_block.FProp(theta.f_block, x2, *extra_inputs)
  x1 = z1 - fx2

  # Computes the gradients.
  dz1 = dy1 + tf.gradients(gz1, z1, dy2)[0]
  dx2 = dy2 + tf.gradients(fx2, x2, dz1)[0]
  dgw = tf.gradients(
      gz1,
      theta.g_block.Flatten(),
      dy2,
      unconnected_gradients=tf.UnconnectedGradients.ZERO)
  dgw = theta.g_block.Pack(dgw)
  dfw = tf.gradients(
      fx2,
      theta.f_block.Flatten(),
      dz1,
      unconnected_gradients=tf.UnconnectedGradients.ZERO)
  dfw = theta.f_block.Pack(dfw)

  return (py_utils.NestedMap(split1=x1, split2=x2),
          py_utils.NestedMap(split1=dz1, split2=dx2),
          py_utils.NestedMap(
              f_block=dfw,
              g_block=dgw,
              global_step=tf.zeros_like(theta.global_step)))
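# (Illustrative sketch, separate from the layer above.) The reversible-block
# algebra used by ReverseAndGrad: the forward pass computes y1 = x1 + f(x2)
# and y2 = x2 + g(y1), so the inputs can be recovered from the outputs as
# x2 = y2 - g(y1) and x1 = y1 - f(x2), and activations never need to be
# stored for the backward pass. Toy check with scalar f and g:
def _ReversibleBlockSketch():
  f = lambda v: 3.0 * v + 1.0
  g = lambda v: -0.5 * v
  x1, x2 = 2.0, -4.0
  y1 = x1 + f(x2)      # forward
  y2 = x2 + g(y1)
  rx2 = y2 - g(y1)     # reverse; mirrors x2 = y2 - gz1 above
  rx1 = y1 - f(rx2)    # mirrors x1 = z1 - fx2 above
  assert (rx1, rx2) == (x1, x2)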