Code example #1
 def testCausualNormalizedDepthwiseConv2DLayerBackProp(self):
     with self.session(use_gpu=True) as sess:
         output = self._testNormalizedDepthwiseConv2DHelper(
             is_causal=True, dropconnect_prob=0.1)
         loss = tf.reduce_sum(output)
         all_vars = tf.trainable_variables()
         grads = tf.gradients(loss, all_vars)
         self.evaluate(tf.global_variables_initializer())
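         # Compare symbolic gradients against numeric estimates for each trainable variable.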
         sym_grads = [sg.eval() for sg in grads]
         num_grads = [
             test_utils.ComputeNumericGradient(sess, loss, v)
             for v in all_vars
         ]
         for sg, ng in zip(sym_grads, num_grads):
             self.assertAllClose(sg, ng, rtol=1e-02, atol=1e-02)
Code example #2
    def _verify_timestep_counts(self,
                                num_splits,
                                auto_partition=False,
                                micro_batch_size=None):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.random.set_seed(1245)
            inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
            if auto_partition:
                layers = [
                    _SimpyLayer.Params().Set(name='layer_{}'.format(i))
                    for i in range(16)
                ]
                net = PipeliningLayer.Params().Set(
                    name='pipeline',
                    num_micro_batches=num_micro_batches,
                    cell_tpl=_Partition(layers, num_splits,
                                        tshape.Shape([batch_size, 8, 8,
                                                      1]))).Instantiate()
            else:
                net = _BuildDummyPipelineCnn(
                    num_splits=num_splits,
                    micro_batch_size=micro_batch_size,
                    num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
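            # Check the gradient norm against a golden value.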
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should equal the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Code example #3
    def testPaddedMeanGrad(self):
        b = builder_lib.ModelBuilderBase()
        p = b._Seq('seq', b._FeaturesFC('fc', 5, 10), b._PaddedMean('p'))
        l = p.Instantiate()

        _, x = self._getNestedMapTestData()
        y = l.FPropDefaultTheta(x)
        loss = tf.reduce_sum(y)

        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)

        with self.session():
            self.evaluate(tf.global_variables_initializer())
            np_grads = self.evaluate(grads)
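            # Every gradient should be finite (no NaNs or Infs).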
            for np_grad in np_grads:
                self.assertTrue(np.all(np.isfinite(np_grad)))
Code example #4
    def Grad(h0, w, b, x, padding, h1, dh1):
        del b
        dh1_orig = dh1
        dh1 = _ApplyPadding(padding, dh1, tf.zeros_like(dh1, dtype=dh1.dtype))

        # We hand-roll the gradient for the 2nd half of the cell as a demo.
        # h1 = tf.sigmoid(xw + b)
        # 𝛔'(x) = ((1 - 𝛔(x)) * 𝛔(x))
        dxwb = (dh1 * (1 - h1) * h1)
        dxw, db = dxwb, tf.reduce_sum(dxwb, axis=0)

        # Uses tf.gradients for the 1st half of the cell as a demo.
        xw = py_utils.Matmul(tf.concat([x, h0], axis=1), w)
        dh0, dx, dw = tf.gradients(ys=[xw], xs=[h0, x, w], grad_ys=[dxw])

        dh0 = _ApplyPadding(padding, dh0, dh1_orig)

        return dh0, dx, dw, db
Code example #5
File: decoder_test.py Project: snsun/lingvo
    def _DecoderGradientCheckerHelper(self,
                                      decoder_cls,
                                      feed_att_context_to_softmax=False):
        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.set_random_seed(_TF_RANDOM_SEED)
            p = self._DecoderParams(dtype=tf.float64, decoder_cls=decoder_cls)
            p.feed_attention_context_vec_to_softmax = feed_att_context_to_softmax
            dec = p.Instantiate()
            encoder_outputs, targets = self._Inputs(dtype=tf.float64)
            loss, _ = dec.FPropDefaultTheta(encoder_outputs,
                                            targets).metrics['loss']
            all_vars = tf.trainable_variables()
            grads = tf.gradients(loss, all_vars)
            print('num of vars ', len(all_vars))

            def DenseGrad(var, grad):
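                # Densify IndexedSlices gradients so they can be compared element-wise.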
                if isinstance(grad, tf.Tensor):
                    return grad
                elif isinstance(grad, tf.IndexedSlices):
                    return tf.unsorted_segment_sum(grad.values, grad.indices,
                                                   tf.shape(var)[0])

            grads = [DenseGrad(x, y) for x, y in zip(all_vars, grads)]

            tf.global_variables_initializer().run()
            symbolic_grads = [gd.eval() for gd in grads]
            numerical_grads = []
            for v in all_vars:
                numerical_grads.append(
                    test_utils.ComputeNumericGradient(sess,
                                                      loss,
                                                      v,
                                                      delta=1e-5))

            rets = {}
            for v, x, y in zip(all_vars, symbolic_grads, numerical_grads):
                print('symbolic_grads, numerical_grads :', v.name)
                print(x)
                print(y)
                self.assertAllClose(x, y)
                rets[v.name] = x

            return rets
Code example #6
    def test_entmax_loss_generate_right_gradient(self):
        inputs = tf.constant([[0.5, 1.0, 2.0]] * 3)
        labels = tf.constant([0, 1, 2])
        expected_loss_gradient = tf.constant(
            [[[-0.97671956, 0.16207013, 0.8146494],
              [0.02328045, -0.83792984, 0.8146494],
              [0.02328045, 0.16207013, -0.1853506]]])
        # Convert labels to a one-hot matrix with the given depth (here, the vocabulary size).
        labels = tf.one_hot(labels, depth=3)
        expected_loss = tf.constant(2.692692)
        entmax_loss_val = tf.reduce_sum(entmax.entmax_loss(
            labels, inputs, 1.5))
        entmax_loss_gradient_val = tf.gradients(entmax_loss_val, inputs)

        with self.session(use_gpu=False) as sess:
            loss_output = sess.run(entmax_loss_val)
            gradient_output = sess.run(entmax_loss_gradient_val)
            self.assertAllClose(expected_loss, loss_output)
            self.assertAllClose(expected_loss_gradient, gradient_output)
Code example #7
    def testBasicGrad(self):
        p = self._testParams(dtype=tf.float64)
        with self.session(use_gpu=False, graph=tf.Graph()) as sess:
            lm = p.Instantiate()
            inputs, paddings, targets = self._testInputs(dtype=tf.float64)
            xent_output, _ = lm.FPropDefaultTheta(
                inputs=inputs,
                paddings=paddings,
                labels=py_utils.NestedMap(class_weights=1 - paddings,
                                          class_ids=targets))

            lm_vars = lm.vars.Flatten()
            # Now add the backward graph.
            grads = tf.gradients(xent_output.avg_xent, lm_vars)

            tf.global_variables_initializer().run()
            self.assertEqual(len(lm_vars), len(grads))
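            # Compare each symbolic gradient with its numeric estimate.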
            for x, grad_x in zip(lm_vars, grads):
                grad_symbolic = sess.run(grad_x)
                grad_numeric = test_utils.ComputeNumericGradient(
                    sess, xent_output.avg_xent, x, delta=1e-6)
                self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)
Code example #8
File: recurrent_test.py Project: galv/lingvo-copy
  def testSimpleStacked(self):
    g = tf.Graph()
    with g.as_default():
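      # Stack three cells on CPU: a Poly cell followed by two Identity cells.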
      devices = ['/cpu:0'] * 3
      cell_fns = [self.Poly, self.Identity, self.Identity]
      cell_grads = [None] * 3
      cell_outs = [lambda x: x] * 3
      cell_out_grads = [lambda x: x] * 3
      w0 = tf.constant(2.)
      w1 = tf.constant(0.)
      w2 = tf.constant(0.)
      thetas = [
          py_utils.NestedMap(x=w0),
          py_utils.NestedMap(x=w1),
          py_utils.NestedMap(x=w2)
      ]
      init_states = [py_utils.NestedMap(s=tf.constant(0.))] * 3
      inputs = py_utils.NestedMap(
          c=tf.constant([1., 2., 1., 0.]),
          padding=tf.constant([0., 0., 0., 1.]))
      output, _ = recurrent.StackedRecurrent(
          devices=devices,
          cell_fns=cell_fns,
          cell_grads=cell_grads,
          cell_outs=cell_outs,
          cell_out_grads=cell_out_grads,
          thetas=thetas,
          init_states=init_states,
          inputs=inputs)
      dw0, dw1, dw2 = tf.gradients(tf.reduce_sum(output.s), [w0, w1, w2])

    with self.session(graph=g):
      (output, dw0, dw1, dw2) = self.evaluate([output.s, dw0, dw1, dw2])

    self.assertAllClose(output, [1., 4., 9., 0.])
    self.assertAllClose(dw2, 0.)
    self.assertAllClose(dw1, 0.)
    self.assertAllClose(dw0, 7.)
Code example #9
    def testMoEFFLayer(self, use_fflayer_start_moe, use_fflayer_end_moe,
                       expected_aux_loss):
        p = self._GetParams()
        if use_fflayer_start_moe:
            p.fflayer_start_tpl = gshard_builder.MoEBuilder.Params().Set(
                e_dim=2, c_dim=2, num_devices=2)
        if use_fflayer_end_moe:
            p.fflayer_end_tpl = gshard_builder.MoEBuilder.Params().Set(
                e_dim=2, c_dim=2, num_devices=2)
        l = p.Instantiate()
        if use_fflayer_start_moe:
            self.assertNotIn('fflayer_start', l.children)
            self.assertIn('fflayer_start_moe', l.children)
        if use_fflayer_end_moe:
            self.assertNotIn('fflayer_end', l.children)
            self.assertIn('fflayer_end_moe', l.children)
        inputs, paddings = self._GetInputs()
        inputs = tf.convert_to_tensor(inputs)
        paddings = tf.convert_to_tensor(paddings)
        in_nmap = py_utils.NestedMap(features=inputs, paddings=paddings)
        in_nmap.aux_loss = tf.convert_to_tensor(0., py_utils.FPropDtype(p))
        out_nmap = l.FPropDefaultTheta(in_nmap)
        self.assertIn('aux_loss', out_nmap)
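        # Add the MoE auxiliary loss to the training loss with a small weight.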
        loss = tf.reduce_sum(out_nmap.features) + 0.01 * out_nmap.aux_loss
        grads = tf.gradients(
            loss,
            l.vars.Flatten(),
            unconnected_gradients=tf.UnconnectedGradients.ZERO)

        with self.session() as sess:
            tf.global_variables_initializer().run()
            out_vals = sess.run(out_nmap.features)
            grad_vals = sess.run(grads)
            self.assertEqual(out_nmap.aux_loss.shape, ())
            aux_loss = sess.run(out_nmap.aux_loss)
            self.assertAlmostEqual(expected_aux_loss, aux_loss, places=5)
            print([x.shape for x in out_vals])
            print([g.shape for g in grad_vals])
Code example #10
 def testRepeatLayerNestedMapBProp(self):
   """Tests RepeatLayer having body layer with mutable NestedMap."""
   repeat = 3
   input_dim, output_dim = 2, 2
   # RepeatLayer with NestedMap in `body` FProp input signature.
   p = layers.RepeatLayer.Params().Set(
       name='nested_map_recurrent',
       repeat=repeat,
       body=FCLayerTestNestedMapFPropInput.Params().Set(
           input_dim=input_dim, output_dim=output_dim))
   # Verify FProp output equality for both layers.
   layer = p.Instantiate()
   with self.session() as sess:
     tf.random.set_seed(24332)
     sess.run(tf.global_variables_initializer())
     inputs = tf.random.normal(shape=[2, 5, 2])
     paddings = tf.zeros((2, 5, 1))
     args = py_utils.NestedMap(features=inputs, paddings=paddings)
     outputs = layer.FPropDefaultTheta(args)
     # Mutate 'args' before the bprop.
     args.features = tf.transpose(args.features, [1, 0, 2])
     args.paddings = tf.transpose(args.paddings, [1, 0, 2])
     in_grads = tf.gradients(ys=tf.nest.flatten(outputs), xs=[inputs])
     sess.run(in_grads)
Code example #11
    def testBProp(self):
        vocab, time, batch = 7, 4, 3
        p = self._MoeLmParams(vocab, True)
        p.dtype = tf.float64

        with self.session(graph=tf.Graph()) as sess:
            np.random.seed(54321)
            tf.random.set_seed(123456)
            lm = p.Instantiate()
            inputs, paddings, labels = self._GetData(vocab, time, batch)
            sess.run(tf.global_variables_initializer())
            xent_output, _ = lm.FPropDefaultTheta(
                inputs=inputs,
                paddings=tf.cast(paddings, p.dtype),
                state0=lm.zero_state(lm.theta, batch),
                labels=labels)

            lm_vars = lm.vars.Flatten()
            # Now add the backward graph.
            grads = tf.gradients(xent_output.avg_xent, lm_vars)

            for i, x in enumerate(grads):
                if isinstance(x, tf.IndexedSlices):
                    grads[i] = tf.math.unsorted_segment_sum(
                        x.values, x.indices, x.dense_shape[0])

            tf.global_variables_initializer().run()
            self.assertEqual(len(lm_vars), len(grads))
            step = 11  # Speed up the test.
            for x, grad_x in zip(lm_vars, grads):
                grad_symbolic = sess.run(grad_x)
                grad_numeric = test_utils.ComputeNumericGradient(
                    sess, xent_output.avg_xent, x, step=step, delta=1e-6)
                self.assertAllClose(
                    grad_symbolic.reshape([-1])[::step],
                    grad_numeric.reshape([-1])[::step])
Code example #12
File: decoder_test.py Project: xueyongfu/lingvo
    def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
        config = tf.ConfigProto(graph_options=tf.GraphOptions(
            optimizer_options=tf.OptimizerOptions(
                do_function_inlining=func_inline)))
        with self.session(use_gpu=False, config=config) as sess:
            tf.set_random_seed(8372749040)
            np.random.seed(274854)
            vn_config = py_utils.VariationalNoiseParams(None, False, False)
            p = self._DecoderParams(vn_config)
            p.dtype = tf.float64

            dec = p.Instantiate()
            src_seq_len = 5
            src_enc = tf.constant(np.random.uniform(size=(src_seq_len, 2, 8)),
                                  tf.float64)
            src_enc_padding = tf.constant(
                [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
                dtype=tf.float64)
            encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                                 padding=src_enc_padding)
            target_ids = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                             [5, 6, 7, 8], [10, 5, 2, 5]],
                            dtype=tf.int32))
            target_labels = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                             [5, 7, 8, 10], [10, 5, 2, 4]],
                            dtype=tf.int32))
            target_paddings = tf.transpose(
                tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0],
                             [0, 1, 0, 0], [1, 1, 1, 1]],
                            dtype=tf.float64))
            target_transcripts = tf.constant(
                ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
            target_weights = 1.0 - target_paddings

            targets = py_utils.NestedMap({
                'ids': target_ids,
                'labels': target_labels,
                'weights': target_weights,
                'paddings': target_paddings,
                'transcripts': target_transcripts,
            })
            metrics = dec.FPropDefaultTheta(encoder_outputs, targets).metrics
            loss = metrics['loss'][0]
            all_vars = tf.trainable_variables()
            grads = tf.gradients(loss, all_vars)

            def DenseGrad(var, grad):
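                # Convert IndexedSlices gradients to dense tensors before comparison.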
                if isinstance(grad, tf.Tensor):
                    return grad
                elif isinstance(grad, tf.IndexedSlices):
                    return tf.unsorted_segment_sum(grad.values, grad.indices,
                                                   tf.shape(var)[0])

            dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

            tf.global_variables_initializer().run()

            test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
            # Second run to make sure the function is deterministic.
            test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

            symbolic_grads = [x.eval() for x in dense_grads if x is not None]
            numerical_grads = []
            for v in all_vars:
                numerical_grads.append(
                    test_utils.ComputeNumericGradient(sess, loss, v))

            for x, y in zip(symbolic_grads, numerical_grads):
                self.assertAllClose(x, y)
Code example #13
    def testStatefulEmbeddingStep(self):
        with self.session(use_gpu=False):
            tf.random.set_seed(398847392)
            p = embedding_steps.StatefulEmbeddingStep.Params().Set(
                name='emb_step',
                num_prev_tokens=1,
                include_current_token=False,
                target_sos_id=1,
                embedding_dim=3,
            )
            p.emb.Set(
                vocab_size=10,
                embedding_dim=3,
                max_num_shards=1,
                params_init=py_utils.WeightInit.Gaussian(0.01),
            )
            p.emb.vn.global_vn = False
            p.emb.vn.per_step_vn = False
            emb = p.Instantiate()

            # Verify that nothing bad happens when these methods are called.
            packed = emb.PrepareExternalInputs(None, None)
            state0 = emb.ZeroState(emb.theta, packed, 2)

            # Test FProp of the unit
            out1, state1 = emb.FProp(
                emb.theta, packed,
                py_utils.NestedMap(inputs=[tf.constant([4, 3], tf.int32)]),
                tf.constant([0.0], dtype=tf.float32), state0)
            self.evaluate(tf.global_variables_initializer())
            out1, state1 = self.evaluate([out1, state1])

            self.assertAllEqual(state1.prev_ids, np.array([[4], [3]]))
            self.assertAllClose(
                out1.output,
                np.array([[-0.00740041, -0.00746862, 0.00093992],
                          [-0.00740041, -0.00746862, 0.00093992]]))

            # Test FProp and BProp when integrated with Recurrent()
            def _FProp(theta, state0, inputs):
                embedding, state1 = emb.FProp(
                    theta,
                    None,
                    inputs,
                    None,
                    state0,
                )
                state1.embedding = embedding.output
                return state1, py_utils.NestedMap()

            inputs = py_utils.NestedMap(inputs=[
                tf.constant([[1., 2.], [3., 2.], [0., 1.], [2., 3.], [3., 0.]])
            ])
            acc, _ = recurrent.Recurrent(
                emb.theta,
                state0,
                inputs,
                _FProp,
            )
            loss = tf.math.l2_normalize(acc.embedding)
            grad = tf.gradients(loss, emb.emb.theta.wm[0])
            self.evaluate(tf.global_variables_initializer())
            acc_, _, grad_ = self.evaluate([acc, emb.emb.theta.wm[0], grad])
            prev_ids_expected = np.array([
                [[1.], [2.]],
                [[3.], [2.]],
                [[0.], [1.]],
                [[2.], [3.]],
                [[3.], [0.]],
            ])
            grad_expected = np.array([[21.952698, 20.50312, 19.037958],
                                      [79.622116, 72.15271, 106.34329],
                                      [41.631985, 70.19292, 75.52608],
                                      [53.644493, 36.28507, 36.64856],
                                      [0., 0., 0.], [0., 0., 0.],
                                      [0., 0., 0.], [0., 0., 0.],
                                      [0., 0., 0.], [0., 0., 0.]])
            self.assertAllClose(acc_.prev_ids, prev_ids_expected)
            self.assertAllClose(grad_[0], grad_expected)
Code example #14
File: graddrop.py Project: vcj-huy/lingvo
        def _Gradient(inputs, _, original_grad):

            # Compute the gradients for each loss w.r.t. the inputs.
            # TODO(jngiam): Look into whether TF dedups this computation.
            per_loss_grads = []
            for loss, _ in self._losses:
                per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
                if per_loss_grad is None:
                    tf.logging.warning(
                        'Loss %s did not result in a gradient during '
                        'GradDrop computation.', loss)
                else:
                    per_loss_grads.append(per_loss_grad)

            if not per_loss_grads:
                raise ValueError('No valid gradients for GradDrop.')

            # Multiply the gradients with the inputs.
            grads = per_loss_grads
            if p.use_input_sign_only:
                input_abs = tf.abs(
                    tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
                grads = [grad * ((inputs) / (input_abs)) for grad in grads]
            else:
                grads = [grad * inputs for grad in grads]

            # Sum gradient over batch, assuming that batch is always on dim 0.
            if p.marginalize_batch_dim:
                grads = [
                    tf.reduce_sum(grad, axis=0, keepdims=True)
                    for grad in grads
                ]

            # First discretize all gradients into their sign values.
            grad_sign_positive = [
                tf.cast(grad > 0.0, tf.float32) for grad in grads
            ]
            grad_sign_negative = [
                tf.cast(grad < 0.0, tf.float32) for grad in grads
            ]

            # Calculate the probability of positive gradients based on equation (1)
            # in the GradDrop paper.
            grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
            prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
            # Implementation of different scales for the keep function. Larger
            # scales result in steeper keep functions.
            prob_pos *= p.keep_prob_function_scale

            if p.keep_prob_function == 'sigmoid':
                # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
                # allows the function scale in sigmoid to be compatible with the
                # function scale in the linear case.
                prob_pos = tf.sigmoid(4.0 * prob_pos)
            elif p.keep_prob_function == 'linear':
                prob_pos += 0.5

            # The main, default mode of GradDrop. Only gradients of one sign are kept,
            # and which sign is calculated via equation (1) of the main paper.
            prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                               tf.float32) - 0.5
            grad_masks = [
                (gsp - gsn) * prob_pos >= 0
                for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
            ]

            # This diag value gives us the percentage of grads which are kept.
            gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
            diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
            summary_utils.scalar('average_grad_mask', diag)
            leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
            transformed_per_loss_grads = [
                grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
                for (leak, grad,
                     grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
            ]

            transformed_grad = tf.cast(tf.add_n(transformed_per_loss_grads),
                                       original_grad.dtype)

            if not p.keep_gradnorm_constant:
                return transformed_grad

            transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
            original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
            return transformed_grad * original_grad_norm / (
                transformed_grad_norm + p.epsilon)
Code example #15
    def _BuildStackedRecurrentElman(self, seqlen, trailing_pad_len, batch,
                                    dims, layers):
        tf.set_random_seed(342462)
        np.random.seed(32540)

        seqlen += trailing_pad_len
        dtype = tf.float64

        def CreateTheta():
            return py_utils.NestedMap(
                w=tf.constant(np.random.uniform(0, 0.2, (2 * dims, dims)),
                              dtype=dtype),
                b=tf.constant(np.random.uniform(0, 0.2, (dims, )),
                              dtype=dtype))

        def CreateState0():
            return py_utils.NestedMap(h=tf.constant(np.random.uniform(
                0, 0.2, (batch, dims)),
                                                    dtype=dtype),
                                      padding=tf.constant([[0]] * batch,
                                                          dtype=dtype))

        devices = ['/cpu:0'] * layers
        cell_fns = [self.Elman] * layers
        cell_grads = [self.ElmanGrad] * layers
        cell_outs = [self.ElmanOut] * layers
        cell_out_grads = [self.ElmanOutGrad] * layers
        thetas = [CreateTheta() for _ in range(layers)]
        init_states = [CreateState0() for _ in range(layers)]
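        # Mark the trailing time steps, plus a short span just before them, as padded.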
        padding = np.zeros((seqlen, batch, 1))
        padding[-trailing_pad_len:, :, :] = 1.
        padding[-trailing_pad_len - 3:-trailing_pad_len - 1, :, :] = 1.
        inputs = py_utils.NestedMap(x=tf.constant(np.random.uniform(
            0, 0.2, (seqlen, batch, dims)),
                                                  dtype=dtype),
                                    padding=tf.constant(padding, dtype=dtype))
        output, _ = recurrent.StackedRecurrent(devices=devices,
                                               cell_fns=cell_fns,
                                               cell_grads=cell_grads,
                                               cell_outs=cell_outs,
                                               cell_out_grads=cell_out_grads,
                                               thetas=thetas,
                                               init_states=init_states,
                                               inputs=inputs)
        o = output.x
        if 'padding' in inputs:
            o *= (1 - inputs.padding)
        loss = tf.reduce_sum(tf.square(o))

        xs = recurrent.Flatten(thetas + [py_utils.NestedMap(x=inputs.x)])
        dxs = tf.gradients(ys=loss, xs=xs)

        # Reference implementation using Recurrent().
        ref = inputs
        for i in range(layers):
            ref = self.ElmanOut(
                recurrent.Recurrent(cell_fn=cell_fns[i],
                                    cell_grad=cell_grads[i],
                                    theta=thetas[i],
                                    state0=init_states[i],
                                    inputs=ref)[0])
        return ref.x, output.x, loss, xs, dxs
Code example #16
    def testBasicWithAccumulator(self):

        with self.session() as sess:

            p = _SampleAccumulatorLayer.Params()
            p.name = 'sample'
            accum_layer = _SampleAccumulatorLayer(p)
            accum_obj = accum_layer.accumulators[accum_layer.accumulator_name]

            theta = py_utils.NestedMap()
            theta.x = tf.constant(2.0)
            state = py_utils.NestedMap()
            state.value = tf.constant(0.0)
            state.x_power = tf.constant(1.0)
            inputs = py_utils.NestedMap()
            inputs.coeff = tf.constant([1., 2., 3.])

            def _CellFn(theta, state, inputs):
                print('TEST ACCUM WITHIN CellFn = ', accum_obj.GetValue())
                accum_obj.Update(inputs.coeff)
                return _Poly(theta, state, inputs)

            # By doing one accumulate prior to recurrent, we ensure that incoming
            # recurrent state is preserved.
            accum_obj.Update(10.)

            # x = 2
            # 1 + 2*x + 3*x^2
            ret = recurrent.Recurrent(theta,
                                      state,
                                      inputs,
                                      _CellFn,
                                      accumulator_layer=accum_layer)

            # Verify bprop.
            y = ret[1].value
            dx, d_coeff = tf.gradients(ys=[y], xs=[theta.x, inputs.coeff])
            dx_val, d_coeff_val = sess.run([dx, d_coeff])

            # 2 + 6*x
            self.assertAllClose(dx_val, 14.)
            self.assertAllClose(d_coeff_val, [1., 2., 4.])

            # acc = [1, 1+2x, 1+2x+3x^2]
            # sum(acc) = 3 + 4x + 3x^2
            acc = ret[0].value
            dx, d_coeff = tf.gradients(ys=[tf.reduce_sum(acc)],
                                       xs=[theta.x, inputs.coeff])
            dx_val, d_coeff_val = sess.run([dx, d_coeff])
            # 4 + 6*x
            self.assertAllClose(dx_val, 16.)
            self.assertAllClose(d_coeff_val, [3., 4., 4.])

            # Verify fprop.
            (acc, state), accum_obj_value = sess.run(
                (ret, accum_obj.GetValue()))

            # Verify that accumulators don't change fprop results.
            self.assertAllClose(acc.value, [1., 5., 17.])
            self.assertAllClose(acc.x_power, [2., 4., 8.])
            self.assertAllClose(state.value, 17.)
            self.assertAllClose(state.x_power, 8.)

            # Verify accumulator (should be 10 (initial increment) + 1 + 2 + 3).
            self.assertEqual(0, accum_obj._disable_count)
            self.assertAllClose([accum_obj_value], [16.0])
Code example #17
    def ReverseAndGrad(self, theta, outputs, d_outputs, f_seed, g_seed,
                       *extra_inputs):
        """Implements Algorithm 1 in the revnet paper.

    Args:
      theta: A NestedMap object containing weights' values of this layer and its
        children layers.
      outputs: A NestedMap: .split1 and .split2 corresponding to y1 and y2.
      d_outputs: A NestedMap: .split1 and .split2 corresponding to dy1 and dy2,
        the total derivatives.
      f_seed: Scalar tensor. The step seed used in forward for the f block.
      g_seed: Scalar tensor. The step seed used in forward for the g block. The
        step seeds are needed for deterministic randomness, e.g. to ensure
        dropout generates the same random mask in forward and reverse_grad.
      *extra_inputs: additional inputs that will be passed to both f and g. No
        gradient will be computed for these inputs.

    Returns:
      A tuple of NestedMaps

      - inputs: .split1 and .split2 corresponding to x1 and x2.
      - d_inputs: .split1 and .split2 corresponding to dx1 and dx2, the total
        derivatives with respect to inputs.
      - d_theta: has the same structure as theta. The total derivatives with
        respect to weights.

    """

        # Stop gradient on the outputs to avoid circular symbolic dependency.
        y1 = tf.stop_gradient(outputs.split1)
        y2 = tf.stop_gradient(outputs.split2)
        dy1 = d_outputs.split1
        dy2 = d_outputs.split2

        # Computes the reverse.
        z1 = y1
        py_utils.ResetStepSeed(g_seed)
        gz1 = self.g_block.FProp(theta.g_block, z1, *extra_inputs)
        x2 = y2 - gz1
        py_utils.ResetStepSeed(f_seed)
        fx2 = self.f_block.FProp(theta.f_block, x2, *extra_inputs)
        x1 = z1 - fx2

        # Computes the gradients.
        dz1 = dy1 + tf.gradients(gz1, z1, dy2)[0]
        dx2 = dy2 + tf.gradients(fx2, x2, dz1)[0]

        dgw = tf.gradients(gz1,
                           theta.g_block.Flatten(),
                           dy2,
                           unconnected_gradients=tf.UnconnectedGradients.ZERO)
        dgw = theta.g_block.Pack(dgw)

        dfw = tf.gradients(fx2,
                           theta.f_block.Flatten(),
                           dz1,
                           unconnected_gradients=tf.UnconnectedGradients.ZERO)
        dfw = theta.f_block.Pack(dfw)

        return (py_utils.NestedMap(split1=x1, split2=x2),
                py_utils.NestedMap(split1=dz1, split2=dx2),
                py_utils.NestedMap(f_block=dfw,
                                   g_block=dgw,
                                   global_step=tf.zeros_like(
                                       theta.global_step)))