Example #1
def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
  """Compute forward-mode gradients."""
  # See b/37888268.

  # This version of forward-mode autodiff is based on code by Tim Cooijmans
  # and handles list arguments and certain special cases, such as when the
  # ys don't depend on one or more of the xs, and when ops.IndexedSlices are
  # generated by the first gradients_impl.gradients call.

  us = [array_ops.zeros_like(y) + float("nan") for y in ys]
  dydxs = gradients_impl.gradients(
      ys, xs, grad_ys=us, stop_gradients=stop_gradients)

  # Deal with types that gradients_impl.gradients returns but cannot itself
  # accept as inputs (IndexedSlices and None).
  dydxs = [
      ops.convert_to_tensor(dydx)
      if isinstance(dydx, ops.IndexedSlices) else dydx for dydx in dydxs
  ]
  dydxs = [
      array_ops.zeros_like(x) if dydx is None else dydx
      for x, dydx in zip(xs, dydxs)
  ]

  dysdx = gradients_impl.gradients(dydxs, us, grad_ys=grad_xs)

  return dysdx
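A minimal usage sketch of the double-backprop trick implemented above, written against the public TF 1.x API (tf.gradients) instead of the internal gradients_impl module; the names and values here are my own illustration, not part of the original snippet. For y = x * x, the forward-mode product (JVP) along a tangent v is 2 * x * v.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.constant([3.0])
v = tf.constant([1.0])                         # tangent vector (plays the role of grad_xs)
y = x * x

u = [tf.zeros_like(y) + float("nan")]          # dummy cotangent; its value is never used
dydx = tf.gradients([y], [x], grad_ys=u)       # reverse-mode pass, linear in u
jvp = tf.gradients(dydx, u, grad_ys=[v])       # differentiate w.r.t. u to recover the JVP

with tf.Session() as sess:
    print(sess.run(jvp))                       # [array([6.], dtype=float32)] == 2 * x * v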
  def testWithIsRecomputeKwarg(self):

    kwarg_values = []

    @rev_block_lib.recompute_grad
    def layer_with_recompute(inputs, is_recomputing=False):
      kwarg_values.append(is_recomputing)
      out = core_layers.dense(inputs, 2)
      out = normalization_layers.batch_normalization(out, training=True)
      if is_recomputing:
        # Ensure that the updates are not duplicated by popping off the latest
        # 2 additions.
        update_ops = ops.get_collection_ref(ops.GraphKeys.UPDATE_OPS)
        update_ops.pop()
        update_ops.pop()
      return out

    x = array_ops.ones((2, 4), dtypes.float32)
    with variable_scope.variable_scope("layer1", use_resource=True):
      y = layer_with_recompute(x)
    loss = math_ops.reduce_sum(y)
    tvars = variables.trainable_variables()
    gradients_impl.gradients(loss, [x] + tvars)

    update_ops = ops.get_collection(ops.GraphKeys.UPDATE_OPS)
    self.assertEqual(2, len(update_ops))
    self.assertEqual([False, True], kwarg_values)
      def fn():
        ta = tensor_array_ops.TensorArray(
            dtype=dtypes.as_dtype(dtype),
            tensor_array_name="foo",
            size=3,
            infer_shape=False)

        value_0 = constant_op.constant(c([[4.0, 5.0]]))
        value_1 = constant_op.constant(c([[3.0, 3.5]]))

        w0 = ta.write(0, value_0)
        w1 = w0.write(1, value_1)
        r0 = w1.read(0)
        r1 = w1.read(1)
        r0_2 = w1.read(0)

        # Test individual components' gradients
        grad_just_r0 = gradients_impl.gradients(
            ys=[r0], xs=[value_0], grad_ys=[c([[2.0, 3.0]])])
        grad_r0_r0_2 = gradients_impl.gradients(
            ys=[r0, r0_2],
            xs=[value_0],
            grad_ys=[c([[2.0, 3.0]]), c([[1.0, -1.0]])])
        grad_just_r1 = gradients_impl.gradients(
            ys=[r1], xs=[value_1], grad_ys=[c([[-2.0, -4.0]])])
        # Test combined gradients
        grad = gradients_impl.gradients(
            ys=[r0, r0_2, r1],
            xs=[value_0, value_1],
            grad_ys=[c([[2.0, 3.0]]),
                     c([[1.0, -1.0]]),
                     c([[-2.0, -10.0]])])

        return [grad_just_r0, grad_r0_r0_2, grad_just_r1, grad]
  def __getitem__(self, spec):
    slice_var = self.var[spec]
    slice_val = self.val[spec]

    # compute analytic 2nd derivative
    analytic_grad2 = 2 * slice_val

    dy = variables.Variable(
        array_ops.ones(
            shape=slice_var.get_shape(), dtype=slice_var.dtype))
    assign = dy.assign(slice_var)
    slice_val_grad, = gradients_impl.gradients(slice_val, self.var, grad_ys=dy)
    slice_val_grad2, = gradients_impl.gradients(
        slice_val_grad, dy, grad_ys=self.var)
    self.sess.run(assign)
    slice_val_grad_evaled, slice_val_grad2_evaled = (
        self.sess.run([slice_val_grad, slice_val_grad2]))
    analytic_grad2_evaled = analytic_grad2.eval()
    self.test.assertAllEqual(slice_val_grad2_evaled, analytic_grad2_evaled)

    # compute analytic gradient for slice
    np_val_grad = (2 * self.varnp * self.varnp)
    np_sliceval_grad = np.zeros(self.var.get_shape())
    np_sliceval_grad[spec] = np_val_grad[spec]
    # verify gradient
    self.test.assertAllEqual(slice_val_grad_evaled, np_sliceval_grad)
  def testEntropyGradient(self):
    with self.cached_session() as sess:
      logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])

      probabilities = nn_ops.softmax(logits)
      log_probabilities = nn_ops.log_softmax(logits)
      true_entropy = - math_ops.reduce_sum(
          probabilities * log_probabilities, axis=-1)

      categorical_distribution = categorical.Categorical(probs=probabilities)
      categorical_entropy = categorical_distribution.entropy()

      # Both entropies should be differentiable with respect to the logits.
      true_entropy_g = gradients_impl.gradients(true_entropy, [logits])
      categorical_entropy_g = gradients_impl.gradients(
          categorical_entropy, [logits])

      res = sess.run({"true_entropy": true_entropy,
                      "categorical_entropy": categorical_entropy,
                      "true_entropy_g": true_entropy_g,
                      "categorical_entropy_g": categorical_entropy_g})
      self.assertAllClose(res["true_entropy"],
                          res["categorical_entropy"])
      self.assertAllClose(res["true_entropy_g"],
                          res["categorical_entropy_g"])
  def testReduction(self):
    g = ops.Graph()

    # BN0 computes a batch-normalized matrix along rows.
    def BN0(x):
      mean = math_ops.reduce_mean(x, [0])
      var = math_ops.reduce_mean(math_ops.square(x - mean))  # biased var
      rstd = math_ops.rsqrt(var + 1e-8)
      return (x - mean) * rstd

    # Wraps BatchNorm in a tf function.
    @function.Defun(dtypes.float32)
    def BN1(x):
      return BN0(x)

    with g.as_default():
      x = array_ops.placeholder(dtypes.float32)
      y0 = BN0(x)  # A plain graph
      y1 = BN1(x)  # A tf function
      dx0, = gradients_impl.gradients([y0], [x])
      dx1, = gradients_impl.gradients([y1], [x])

    # Both should produce the same result and gradient.
    with self.test_session(graph=g) as sess:
      vals = sess.run([y0, y1, dx0, dx1], {x: np.random.uniform(size=(3, 7))})
      self.assertAllClose(vals[0], vals[1])
      self.assertAllClose(vals[2], vals[3])
  def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
                            strides, dilations, padding, data_format, use_gpu,
                            err, mode):
    total_input_size = 1
    total_filter_size = 1
    for s in input_sizes:
      total_input_size *= s
    for s in filter_sizes:
      total_filter_size *= s
    # Initialize the input and filter tensors with arrays of incrementing
    # numbers starting from 1.
    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
    default_dilations = (
        dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)

    # If any dilation rate is larger than 1, only do test on the GPU
    # because we currently do not have a CPU implementation for arbitrary
    # dilation rates.
    if default_dilations or use_gpu:
      with self.cached_session(use_gpu=use_gpu) as sess:
        if data_format == "NCDHW":
          input_sizes = test_util.NHWCToNCHW(input_sizes)
        t1 = constant_op.constant(x1, shape=input_sizes)
        t2 = constant_op.constant(x2, shape=filter_sizes)
        full_strides = [1] + strides + [1]
        full_dilations = [1] + dilations + [1]
        if data_format == "NCDHW":
          full_strides = test_util.NHWCToNCHW(full_strides)
          full_dilations = test_util.NHWCToNCHW(full_dilations)
        actual = nn_ops.conv3d(
            t1,
            t2,
            strides=full_strides,
            dilations=full_dilations,
            padding=padding,
            data_format=data_format)
        expected = nn_ops.convolution(
            t1,
            t2,
            padding=padding,
            strides=strides,
            dilation_rate=dilations,
            data_format=data_format)
        if data_format == "NCDHW":
          actual = test_util.NCHWToNHWC(actual)
          expected = test_util.NCHWToNHWC(expected)
        actual_grad = gradients_impl.gradients(actual, t1
                                               if mode == "input" else t2)[0]
        expected_grad = gradients_impl.gradients(expected, t1
                                                 if mode == "input" else t2)[0]
        # "values" consists of two tensors for two backprops
        actual_value = self.evaluate(actual_grad)
        expected_value = self.evaluate(expected_grad)
        self.assertShapeEqual(actual_value, actual_grad)
        self.assertShapeEqual(expected_value, expected_grad)
      print("expected = ", expected_value)
      print("actual = ", actual_value)
      self.assertArrayNear(expected_value.flatten(), actual_value.flatten(),
                           err)
  def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    with ops.Graph().as_default():
      embedding_matrix = variable_scope.get_variable(
          "embedding_matrix", [5, 5],
          initializer=init_ops.random_normal_initializer(),
          use_resource=use_resource)

      def Cond(it, _):
        return it < 5

      def Body(it, cost):
        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
        cost = control_flow_ops.cond(
            math_ops.equal(it, 3), lambda: math_ops.square(cost),
            lambda: cost + math_ops.reduce_sum(embedding))
        return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])

      dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
      dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                           dynamic_grads.indices)

      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      static = math_ops.square(
          math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
          math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
      static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
      static_grads = math_ops.segment_sum(static_grads.values,
                                          static_grads.indices)

      with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
  def testShapePassedToGradient(self):
    with ops.Graph().as_default():
      @custom_gradient.custom_gradient
      def differentiable_scatter_update(handle, indices, values):
        with ops.control_dependencies([
            resource_variable_ops.resource_scatter_update(
                handle, indices, values)]):
          new_handle = array_ops.identity(handle)

        def grad(dresult):
          self.assertIsNotNone(
              tensor_util.constant_value(dresult.dense_shape))
          return [dresult, None, None]

        return new_handle, grad

      var = variable_scope.get_variable(
          "foo", shape=[20], initializer=init_ops.zeros_initializer,
          dtype=dtypes.float64, use_resource=True)

      indices = math_ops.range(10)
      updates = math_ops.range(9, -1, -1, dtype=dtypes.float64)
      new_handle = differentiable_scatter_update(var.handle, indices, updates)
      gathered = resource_variable_ops.resource_gather(
          new_handle, indices, dtype=var.dtype)
      gradients_impl.gradients([gathered], [updates])
def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
                                num_layers, max_time, compiled):
  with variable_scope.variable_scope(
      "root",
      initializer=init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)):
    inputs = variable_scope.get_variable(
        "inputs", initializer=random_ops.random_uniform(
            (max_time, batch_size, input_depth), seed=1))
    maybe_xla = lambda c: rnn_cell.CompiledWrapper(c) if compiled else c
    cell = core_rnn_cell_impl.MultiRNNCell(
        [maybe_xla(core_rnn_cell_impl.LSTMCell(num_units))
         for _ in range(num_layers)])
    initial_state = cell.zero_state(
        batch_size=batch_size, dtype=dtypes.float32)
    outputs, final_state = rnn.dynamic_rnn(
        cell=cell, inputs=inputs, initial_state=initial_state,
        time_major=True)
    flat_final_state = nest.flatten(final_state)
    trainable_variables = variables.trainable_variables()
    outputs_grad = gradients_impl.gradients(
        [outputs],
        trainable_variables + [inputs] + nest.flatten(initial_state))
    final_state_grad = gradients_impl.gradients(
        flat_final_state,
        trainable_variables + [inputs] + nest.flatten(initial_state))

    return {"outputs": outputs,
            "final_state": flat_final_state,
            "outputs_grad": outputs_grad,
            "final_state_grad": final_state_grad}
      def body(it, cost):
        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
        cost = control_flow_ops.cond(
            math_ops.equal(it, 3), lambda: math_ops.square(cost),
            (lambda: cost + math_ops.reduce_sum(embedding)))
        return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          cond, body, [constant_op.constant(0),
                       constant_op.constant(0.0)])

      dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
      dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                           dynamic_grads.indices)

      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      static = math_ops.square(
          math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
          math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
      static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
      static_grads = math_ops.segment_sum(static_grads.values,
                                          static_grads.indices)

      with self.cached_session():
        self.evaluate(variables.global_variables_initializer())
        self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
Example #12
  def testNanFromGradsDontPropagate(self):
    """Test that update with NaN gradients does not cause NaN in results."""
    def _nan_log_prob_with_nan_gradient(x):
      return np.nan * math_ops.reduce_sum(x)

    with self.test_session() as sess:
      initial_x = math_ops.linspace(0.01, 5, 10)
      updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel(
          2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
          [initial_x, updated_x, acceptance_probs])

      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))

      self.assertAllEqual(initial_x_val, updated_x_val)
      self.assertEqual(acceptance_probs_val, 0.)

      self.assertAllFinite(
          gradients_impl.gradients(updated_x, initial_x)[0].eval())
      self.assertTrue(
          gradients_impl.gradients(new_grad, initial_x)[0] is None)

      # Gradients of the acceptance probs and new log prob are not finite.
      _ = new_log_prob  # Prevent unused arg error.
Example #13
  def _testCond(self, true_fn, false_fn, train_vals, feed_dict=None):
    if not feed_dict:
      feed_dict = {}
    with self.test_session(graph=ops.get_default_graph()) as sess:
      pred = array_ops.placeholder(dtypes.bool, name="pred")

      expected = control_flow_ops.cond(pred, true_fn, false_fn, name="expected")
      actual = cond_v2.cond_v2(pred, true_fn, false_fn, name="actual")

      expected_grad = gradients_impl.gradients(expected, train_vals)
      actual_grad = gradients_impl.gradients(actual, train_vals)

      sess_run_args = {pred: True}
      sess_run_args.update(feed_dict)
      expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
          (expected, actual, expected_grad, actual_grad), sess_run_args)
      self.assertEqual(expected_val, actual_val)
      self.assertEqual(expected_grad_val, actual_grad_val)

      sess_run_args = {pred: False}
      sess_run_args.update(feed_dict)
      expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
          (expected, actual, expected_grad, actual_grad), sess_run_args)
      self.assertEqual(expected_val, actual_val)
      self.assertEqual(expected_grad_val, actual_grad_val)
Example #14
  def testNanFromGradsDontPropagate(self):
    """Test that update with NaN gradients does not cause NaN in results."""
    def _nan_log_prob_with_nan_gradient(x):
      return np.nan * math_ops.reduce_sum(x)

    with self.test_session() as sess:
      initial_x = math_ops.linspace(0.01, 5, 10)
      updated_x, kernel_results = hmc.kernel(
          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
          current_state=initial_x,
          step_size=2.,
          num_leapfrog_steps=5,
          seed=47)
      initial_x_, updated_x_, acceptance_probs_ = sess.run(
          [initial_x, updated_x, kernel_results.acceptance_probs])

      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))

      self.assertAllEqual(initial_x_, updated_x_)
      self.assertEqual(acceptance_probs_, 0.)

      self.assertAllFinite(
          gradients_ops.gradients(updated_x, initial_x)[0].eval())
      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
          kernel_results.proposed_grads_target_log_prob, initial_x)])
      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
          kernel_results.proposed_grads_target_log_prob,
          kernel_results.proposed_state)])
Example #15
  def testSecondDerivative(self):
    with self.test_session() as sess:
      pred = array_ops.placeholder(dtypes.bool, name="pred")
      x = constant_op.constant(3.0, name="x")

      def true_fn():
        return math_ops.pow(x, 3)

      def false_fn():
        return x

      cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond")
      cond_grad = gradients_impl.gradients(cond, [x])
      cond_grad_grad = gradients_impl.gradients(cond_grad, [x])

      # d[x^3]/dx = 3x^2
      true_val = sess.run(cond_grad, {pred: True})
      self.assertEqual(true_val, [27.0])
      # d[x]/dx = 1
      false_val = sess.run(cond_grad, {pred: False})
      self.assertEqual(false_val, [1.0])

      true_val = sess.run(cond_grad_grad, {pred: True})
      # d2[x^3]/dx2 = 6x
      self.assertEqual(true_val, [18.0])
      false_val = sess.run(cond_grad_grad, {pred: False})
      # d2[x]/dx2 = 0
      self.assertEqual(false_val, [0.0])
  def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
    with self.test_session(use_gpu=True) as session:
      a = array_ops.identity(
          np.arange(
              3 * 5, dtype=np.float32).reshape(3, 5) + 1)
      b = array_ops.identity(
          np.arange(
              3 * 5, dtype=np.float32).reshape(3, 5) + 1 + 3 * 5)
      ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
      ta = ta.write(0, a, name="write_a")
      ta = ta.write(1, b, name="write_b")
      c = (
          ta.read(
              0, name="read_a_0") +  # a + b
          ta.read(
              1, name="read_b_0"))
      g0 = -(np.arange(3 * 5, dtype=np.float32).reshape(3, 5) + 1)
      grad_a = gradients_impl.gradients([c], [a], [g0])[0]  # d(a+b)/da = 1
      grad_b = gradients_impl.gradients([c], [b], [g0])[0]  # d(a+b)/db = 1

      # Test gradients calculated individually
      grad_a_t, = session.run([grad_a])
      self.assertAllEqual(grad_a_t, g0)

      grad_b_t, = session.run([grad_b])
      self.assertAllEqual(grad_b_t, g0)

      # Test gradients calculated jointly
      joint_grad_a_t, joint_grad_b_t = session.run([grad_a, grad_b])
      self.assertAllEqual(joint_grad_a_t, g0)
      self.assertAllEqual(joint_grad_b_t, g0)
Example #17
  def testGradientFloat16(self):
    with self.test_session(use_gpu=True) as sess:
      # Randomly construct a 1D shape from [1, 40)
      shape = random_ops.random_uniform(
          [1], minval=1, maxval=40, dtype=dtypes.int32)

      # Construct the fp32 graph and its gradient.
      x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
      y1 = nn_ops.relu(x, name="relu_fp32")
      l1 = nn_ops.l2_loss(y1)
      dx_f32 = gradients_impl.gradients(l1, x)

      # Construct the fp16 graph and its gradient.
      # It starts with the same x, in fp32. But before it reaches Relu, it is
      # cast into fp16. So during backprop, the gradient computation is in fp16.
      x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
      y2 = nn_ops.relu(x2, name="relu_fp16")
      l2 = nn_ops.l2_loss(y2)
      dx_f16 = gradients_impl.gradients(l2, x)

      # Repeat the experiment 100 times. The tensor shapes and values are
      # randomly generated for each run.
      for _ in xrange(100):
        dx_f32_v, dx_f16_v = sess.run([dx_f32, dx_f16])
        self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
  def testGradientThroughSingleBranchOutsideOfContext(self):
    x = constant_op.constant(2.)
    s = constant_op.constant(True)
    x_false, x_true = control_flow_ops.switch(x, s)
    grad_x_true = gradients_impl.gradients(x_true, x)[0]
    grad_x_false = gradients_impl.gradients(x_false, x)[0]
    self.assertEquals(self.evaluate(grad_x_true), 1.)
    self.assertEquals(self.evaluate(grad_x_false), 0.)
  def grad_fn(inputs, trainable_variables, outputs, grad_outputs):
    outputs = outputs[0]
    grad_outputs = grad_outputs[0]
    grad_inputs = gradients_impl.gradients(
        outputs, inputs, grad_ys=grad_outputs)
    grad_vars = gradients_impl.gradients(
        outputs, trainable_variables, grad_ys=grad_outputs)
    return grad_inputs, grad_vars
Example #20
  def testDoubleDerivative(self):
    x = constant_op.constant(2.)
    ret = while_loop_v2(lambda v: v < 8., lambda v: v**2, [x])  # x**4
    grad = gradients_impl.gradients(ret, [x])  # 4x**3
    grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
    with self.cached_session() as sess:
      self.assertEqual(sess.run(ret), 16.)
      self.assertSequenceEqual(sess.run(grad), [32.])
      self.assertSequenceEqual(sess.run(grad_grad), [48.])
Example #21
  def testMultipleWhileLoops(self):
    x = constant_op.constant(2.)
    ret1 = while_loop_v2(lambda v: v < 4., lambda v: v * v, [x])  # x**2
    ret2 = while_loop_v2(lambda v: v < 16., lambda v: v * v, ret1)  # x**4
    grad = gradients_impl.gradients(ret2, [x])  # 4x**3
    grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
    with self.cached_session() as sess:
      self.assertSequenceEqual(sess.run(grad), [32.])
      self.assertSequenceEqual(sess.run(grad_grad), [48.])
Example #22
  def _testGradient(self, np_input, bias, dtype, data_format, use_gpu):
    with self.test_session(use_gpu=use_gpu):
      if data_format == "NCHW":
        np_input = self._NHWCToNCHW(np_input)
      input_tensor = constant_op.constant(
          np_input, shape=np_input.shape, dtype=dtype)
      bias_tensor = constant_op.constant(bias, shape=bias.shape, dtype=dtype)
      output_tensor = nn_ops.bias_add(
          input_tensor, bias_tensor, data_format=data_format)
      tensor_jacob_t, tensor_jacob_n = gradient_checker.compute_gradient(
          input_tensor, np_input.shape, output_tensor, np_input.shape)
      bias_jacob_t, bias_jacob_n = gradient_checker.compute_gradient(
          bias_tensor, bias.shape, output_tensor, np_input.shape)

      # Test gradient of BiasAddGrad
      bias_add_grad = gradients_impl.gradients(
          nn_ops.l2_loss(output_tensor), bias_tensor)[0]
      grad_jacob_t, grad_jacob_n = gradient_checker.compute_gradient(
          output_tensor, np_input.shape, bias_add_grad, bias.shape)

      if dtype == np.float16:
        # Compare fp16 theoretical gradients to fp32 numerical gradients,
        # since fp16 numerical gradients are too imprecise unless great
        # care is taken with choosing the inputs and the delta. This is
        # a weaker check (in particular, it does not test the op itself,
        # only its gradient), but it's much better than nothing.
        input_tensor = constant_op.constant(
            np_input, shape=np_input.shape, dtype=np.float32)
        bias_tensor = constant_op.constant(
            bias, shape=bias.shape, dtype=np.float32)
        output_tensor = nn_ops.bias_add(
            input_tensor, bias_tensor, data_format=data_format)
        _, tensor_jacob_n = gradient_checker.compute_gradient(input_tensor,
                                                              np_input.shape,
                                                              output_tensor,
                                                              np_input.shape)
        _, bias_jacob_n = gradient_checker.compute_gradient(bias_tensor,
                                                            bias.shape,
                                                            output_tensor,
                                                            np_input.shape)

        bias_add_grad = gradients_impl.gradients(
            nn_ops.l2_loss(output_tensor), bias_tensor)[0]
        _, grad_jacob_n = gradient_checker.compute_gradient(output_tensor,
                                                            np_input.shape,
                                                            bias_add_grad,
                                                            bias.shape)

      threshold = 2e-3
      if dtype == dtypes.float64:
        threshold = 1e-10
      self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
      # TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
      # once we figure out why this check started failing with cuda mavx.
      if dtype == dtypes.float64 or data_format != "NCHW":
        self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
        self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
Example #23
  def testNoGradients(self):
    component = constant_op.constant([1.])
    side = constant_op.constant(0.)
    add = lambda x: x + side
    dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
    value = dataset_ops.make_one_shot_iterator(dataset).get_next()
    self.assertIsNone(gradients_impl.gradients(value, component)[0])
    self.assertIsNone(gradients_impl.gradients(value, side)[0])
    self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
  def testTimeReversedFusedRNN(self):
    with self.test_session() as sess:
      initializer = init_ops.random_uniform_initializer(
          -0.01, 0.01, seed=19890213)
      fw_cell = core_rnn_cell_impl.BasicRNNCell(10)
      bw_cell = core_rnn_cell_impl.BasicRNNCell(10)
      batch_size = 5
      input_size = 20
      timelen = 15
      inputs = constant_op.constant(
          np.random.randn(timelen, batch_size, input_size))

      # test bi-directional rnn
      with variable_scope.variable_scope("basic", initializer=initializer):
        unpacked_inputs = array_ops.unstack(inputs)
        outputs, fw_state, bw_state = core_rnn.static_bidirectional_rnn(
            fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
        packed_outputs = array_ops.stack(outputs)
        basic_vars = [
            v for v in variables.trainable_variables()
            if v.name.startswith("basic/")
        ]
        sess.run([variables.global_variables_initializer()])
        basic_outputs, basic_fw_state, basic_bw_state = sess.run(
            [packed_outputs, fw_state, bw_state])
        basic_grads = sess.run(gradients_impl.gradients(packed_outputs, inputs))
        basic_wgrads = sess.run(
            gradients_impl.gradients(packed_outputs, basic_vars))

      with variable_scope.variable_scope("fused", initializer=initializer):
        fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
            core_rnn_cell_impl.BasicRNNCell(10))
        fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
            fused_rnn_cell.FusedRNNCellAdaptor(
                core_rnn_cell_impl.BasicRNNCell(10)))
        fw_outputs, fw_state = fused_cell(
            inputs, dtype=dtypes.float64, scope="fw")
        bw_outputs, bw_state = fused_bw_cell(
            inputs, dtype=dtypes.float64, scope="bw")
        outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
        fused_vars = [
            v for v in variables.trainable_variables()
            if v.name.startswith("fused/")
        ]
        sess.run([variables.global_variables_initializer()])
        fused_outputs, fused_fw_state, fused_bw_state = sess.run(
            [outputs, fw_state, bw_state])
        fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
        fused_wgrads = sess.run(gradients_impl.gradients(outputs, fused_vars))

      self.assertAllClose(basic_outputs, fused_outputs)
      self.assertAllClose(basic_fw_state, fused_fw_state)
      self.assertAllClose(basic_bw_state, fused_bw_state)
      self.assertAllClose(basic_grads, fused_grads)
      for basic, fused in zip(basic_wgrads, fused_wgrads):
        self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
  def testFromLibrary(self):
    # Define some functions with different gradient functions. Note that many of
    # the below functions are identical since function bodies don't matter for
    # this test.

    @function.Defun(dtypes.float32, dtypes.float32)
    def G1(x, dy):
      return x * dy

    @function.Defun(dtypes.float32, dtypes.float32)
    def G2(x, dy):
      return x * dy

    # F1 and F2 have the same gradient function
    @function.Defun(dtypes.float32, grad_func=G1)
    def F1(x):
      return math_ops.exp(x) - math_ops.exp(-x)

    @function.Defun(dtypes.float32, grad_func=G1)
    def F2(x):
      return math_ops.exp(x) - math_ops.exp(-x)

    # F3 has a different gradient function
    @function.Defun(dtypes.float32, grad_func=G2)
    def F3(x):
      return math_ops.exp(x) - math_ops.exp(-x)

    # F4 has no gradient function
    @function.Defun(dtypes.float32)
    def F4(x):
      return math_ops.exp(x) - math_ops.exp(-x)

    # Instantiate all functions
    g = ops.Graph()
    with g.as_default():
      c = constant_op.constant(1.0, dtypes.float32)
      f1 = F1(c)
      f2 = F2(c)
      f3 = F3(c)
      f4 = F4(c)
      gradients_impl.gradients([f1, f2, f3, f4], c)

    library = g.as_graph_def().library
    new_funcs = function._from_library(library)

    def CheckNewFunc(func):
      new_func = [f for f in new_funcs if f.name == func.name]
      self.assertEqual(len(new_func), 1)
      self.expectFunctionsEqual(func, new_func=new_func[0])

    CheckNewFunc(G1)
    CheckNewFunc(G2)
    CheckNewFunc(F1)
    CheckNewFunc(F2)
    CheckNewFunc(F3)
    CheckNewFunc(F4)
Example #26
  def testGradGrad(self):
    with self.test_session():
      x = array_ops.placeholder(dtype=dtypes.float32)
      elu = nn_ops.elu(x)
      g, = gradients_impl.gradients(elu, x)
      gg, = gradients_impl.gradients(g, x)

      for x_val in [-1, -0.5, 0.5, 1]:
        err = np.abs(gg.eval(feed_dict={x: x_val}) - _elu_grad_grad(x_val))
        self.assertLess(err, 1e-4)
  def testMap_Grad(self):
    with self.cached_session():
      param = constant_op.constant(2.0)
      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
      y = functional_ops.map_fn(
          lambda x: math_ops.multiply(math_ops.square(x), param), elems)
      r = gradients_impl.gradients(y, param)[0]
      self.assertAllEqual(91.0, self.evaluate(r))
      r = gradients_impl.gradients(y, elems)[0]
      self.assertAllEqual([4.0, 8.0, 12.0, 16.0, 20.0, 24.0], self.evaluate(r))
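A worked check (my own note) of the values asserted above: tf.gradients sums over all map_fn outputs y_i = param * x_i**2, so d/d(param) sum_i param * x_i**2 = sum_i x_i**2 = 1 + 4 + 9 + 16 + 25 + 36 = 91, and d/d(x_i) = 2 * param * x_i = 4 * x_i, giving [4.0, 8.0, 12.0, 16.0, 20.0, 24.0].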
  def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
    random_seed.set_random_seed(5)

    batch_size = 8
    num_labels = 6
    label_length = 5
    num_frames = 12
    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
    labels = random_ops.random_uniform(
        [batch_size, label_length], minval=0, maxval=num_labels-1,
        dtype=dtypes.int64)

    label_lengths = random_ops.random_uniform(
        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
    label_mask = array_ops.sequence_mask(
        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
    labels *= label_mask

    logit_lengths = [num_frames] * batch_size

    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
        tf_ctc_loss_labels, label_lengths)

    tf_nn_ctc_loss = ctc_ops.ctc_loss(
        labels=tf_ctc_loss_labels,
        inputs=logits,
        sequence_length=logit_lengths,
        time_major=True)
    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

    # Shift the blank logits/labels to be somewhere in the middle.
    blank_index = 2
    shifted_logits = array_ops.concat([
        logits[:, :, :blank_index],
        logits[:, :, -1:],
        logits[:, :, blank_index:-1],
    ], axis=2)
    shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)

    ctc_loss = ctc_ops.ctc_loss_dense(
        labels=shifted_labels,
        logits=shifted_logits,
        label_length=label_lengths,
        logit_length=logit_lengths,
        blank_index=blank_index)
    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

    with self.cached_session() as sess:
      for _ in range(32):
        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
        self.assertAllClose(
            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
            rtol=2e-06,
            atol=2e-06)
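A tiny NumPy illustration (my own sketch, not part of the test) of the blank-shifting above: the last class (the blank) is moved to position blank_index, and label ids at or above blank_index are shifted up by one to skip the new blank slot.

import numpy as np

blank_index = 2
logits = np.arange(6).reshape(1, 1, 6)      # fake [time=1, batch=1, num_labels=6] scores
shifted = np.concatenate([logits[:, :, :blank_index],
                          logits[:, :, -1:],
                          logits[:, :, blank_index:-1]], axis=2)
print(shifted[0, 0])                        # [0 1 5 2 3 4]: the old last class is now at index 2

labels = np.array([0, 1, 2, 3, 4])
print(np.where(labels < blank_index, labels, labels + 1))   # [0 1 3 4 5]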
  def testNoIntegerGradient6(self):
    k = constant_op.constant(3)
    x = math_ops.to_float(k)
    grad_1, = gradients_impl.gradients(k * k, k)
    grad_2, = gradients_impl.gradients(x * x, k)
    grad_3, = gradients_impl.gradients(math_ops.square(k), k)
    grad_4, = gradients_impl.gradients(math_ops.square(x), k)
    self.assertIsNone(grad_1)
    self.assertIsNone(grad_2)
    self.assertIsNone(grad_3)
    self.assertIsNone(grad_4)
  def testPrintGradient(self):
    with self.test_session():
      inp = constant_op.constant(2.0, shape=[100, 32], name="in")
      w = constant_op.constant(4.0, shape=[10, 100], name="w")
      wx = math_ops.matmul(w, inp, name="wx")
      wx_print = logging_ops.Print(wx, [w, w, w])
      wx_grad = gradients_impl.gradients(wx, w)[0]
      wx_print_grad = gradients_impl.gradients(wx_print, w)[0]
      wxg = wx_grad.eval()
      wxpg = wx_print_grad.eval()
    self.assertAllEqual(wxg, wxpg)
Example #31
    def _testRevBlock(self,
                      x=None,
                      f=None,
                      g=None,
                      f_side_input=None,
                      g_side_input=None):
        random_seed.set_random_seed(1234)

        if f is None:

            def f(x):  # pylint: disable=function-redefined
                return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)

        if g is None:

            def g(x):  # pylint: disable=function-redefined
                return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)

        if f_side_input is None:
            f_side_input = []

        if g_side_input is None:
            g_side_input = []

        if x is None:
            x = random_ops.random_uniform([self.BATCH_SIZE, self.CHANNELS],
                                          dtype=dtypes.float32)
        x1, x2 = array_ops.split(x, 2, axis=-1)

        with variable_scope.variable_scope("rev_test") as vs:
            y1_rev, y2_rev = rev_block_lib.rev_block(
                x1,
                x2,
                f,
                g,
                f_side_input=f_side_input,
                g_side_input=g_side_input,
                num_layers=self.NUM_LAYERS)
            y_rev = array_ops.concat([y1_rev, y2_rev], axis=1)
            fg_vars = vs.trainable_variables()

        num_vars = len(variables.global_variables())
        with variable_scope.variable_scope(vs, reuse=True):
            y1, y2 = rev_block_lib.rev_block(x1,
                                             x2,
                                             f,
                                             g,
                                             f_side_input=f_side_input,
                                             g_side_input=g_side_input,
                                             num_layers=self.NUM_LAYERS,
                                             is_training=False)
            y = array_ops.concat([y1, y2], axis=1)
        # Ensure no new vars were created - full reuse
        assert len(variables.global_variables()) == num_vars

        loss_rev = math_ops.reduce_mean(y_rev + 10.)
        loss = math_ops.reduce_mean(y + 10.)

        wrt = [x] + f_side_input + g_side_input + fg_vars
        grads_rev = gradients_impl.gradients(loss_rev, wrt)
        grads = gradients_impl.gradients(loss, wrt)

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            y_val, yd_val, gd_val, g_val = sess.run(
                [y, y_rev, grads_rev, grads])
            self.assertAllClose(y_val, yd_val)
            for g1, g2 in zip(gd_val, g_val):
                self.assertAllClose(g1, g2)
Example #32
    def testBasicRNNFusedWrapper(self):
        """This test checks that using a wrapper for BasicRNN works as expected."""

        with self.cached_session() as sess:
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890212)
            cell = rnn_cell.BasicRNNCell(10)
            batch_size = 5
            input_size = 20
            timelen = 15
            inputs = constant_op.constant(
                np.random.randn(timelen, batch_size, input_size))
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                unpacked_inputs = array_ops.unstack(inputs)
                outputs, state = rnn.static_rnn(cell,
                                                unpacked_inputs,
                                                dtype=dtypes.float64)
                packed_outputs = array_ops.stack(outputs)
                basic_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("basic/")
                ]
                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_state = sess.run([packed_outputs, state])
                basic_grads = sess.run(
                    gradients_impl.gradients(packed_outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(packed_outputs, basic_vars))

            with variable_scope.variable_scope("fused_static",
                                               initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    rnn_cell.BasicRNNCell(10))
                outputs, state = fused_cell(inputs, dtype=dtypes.float64)
                fused_static_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused_static/")
                ]
                sess.run([variables.global_variables_initializer()])
                fused_static_outputs, fused_static_state = sess.run(
                    [outputs, state])
                fused_static_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_static_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_static_vars))

            self.assertAllClose(basic_outputs, fused_static_outputs)
            self.assertAllClose(basic_state, fused_static_state)
            self.assertAllClose(basic_grads, fused_static_grads)
            for basic, fused in zip(basic_wgrads, fused_static_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)

            with variable_scope.variable_scope("fused_dynamic",
                                               initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    rnn_cell.BasicRNNCell(10), use_dynamic_rnn=True)
                outputs, state = fused_cell(inputs, dtype=dtypes.float64)
                fused_dynamic_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused_dynamic/")
                ]
                sess.run([variables.global_variables_initializer()])
                fused_dynamic_outputs, fused_dynamic_state = sess.run(
                    [outputs, state])
                fused_dynamic_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_dynamic_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_dynamic_vars))

            self.assertAllClose(basic_outputs, fused_dynamic_outputs)
            self.assertAllClose(basic_state, fused_dynamic_state)
            self.assertAllClose(basic_grads, fused_dynamic_grads)
            for basic, fused in zip(basic_wgrads, fused_dynamic_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
Example #33
    def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
        with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
            batch_size = 2
            cell_size = 3
            input_size = 4

            seed = 1994
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=seed)
            np.random.seed(seed)

            # Inputs
            x = array_ops.zeros([batch_size, input_size])
            h = array_ops.zeros([batch_size, cell_size])

            # Values for the inputs.
            x_value = np.random.rand(batch_size, input_size)
            h_value = np.random.rand(batch_size, cell_size)

            # Gradients from the block GRU cell implementation.
            with vs.variable_scope("block", initializer=initializer):
                output = gru_ops.GRUBlockCell(cell_size)(x, h)
                sess.run([variables.global_variables_initializer()])

                all_variables = variables.global_variables()[0:4]
                [w_ru, b_ru, w_c, b_c] = all_variables

                d_new_h_wrt_x = gradients_impl.gradients([output], x)
                d_new_h_wrt_h = gradients_impl.gradients([output], h)
                d_new_h_wrt_w_ru = gradients_impl.gradients([output], w_ru)
                d_new_h_wrt_w_c = gradients_impl.gradients([output], w_c)
                d_new_h_wrt_b_ru = gradients_impl.gradients([output], b_ru)
                d_new_h_wrt_b_c = gradients_impl.gradients([output], b_c)

                d_block_res = sess.run([
                    d_new_h_wrt_x, d_new_h_wrt_h, d_new_h_wrt_w_ru,
                    d_new_h_wrt_w_c, d_new_h_wrt_b_ru, d_new_h_wrt_b_c
                ], {
                    x: x_value,
                    h: h_value
                })

            # Gradients from the basic GRU cell implementation.
            with vs.variable_scope("basic", initializer=initializer):
                output = rnn_cell.GRUCell(cell_size)(x, h)
                sess.run([variables.global_variables_initializer()])

                all_variables = variables.global_variables()[4:8]
                [w_ru, b_ru, w_c, b_c] = all_variables

                d_new_h_wrt_x = gradients_impl.gradients([output], x)
                d_new_h_wrt_h = gradients_impl.gradients([output], h)
                d_new_h_wrt_w_ru = gradients_impl.gradients([output], w_ru)
                d_new_h_wrt_w_c = gradients_impl.gradients([output], w_c)
                d_new_h_wrt_b_ru = gradients_impl.gradients([output], b_ru)
                d_new_h_wrt_b_c = gradients_impl.gradients([output], b_c)

                d_basic_res = sess.run([
                    d_new_h_wrt_x, d_new_h_wrt_h, d_new_h_wrt_w_ru,
                    d_new_h_wrt_w_c, d_new_h_wrt_b_ru, d_new_h_wrt_b_c
                ], {
                    x: x_value,
                    h: h_value
                })

            # Check lengths of derivative results.
            self.assertEqual(len(d_block_res), len(d_basic_res))
            # Check the value of every derivative result.
            for block, basic in zip(d_block_res, d_basic_res):
                self.assertAllClose(block, basic)
Example #34
    def testGradient(self,
                     params,
                     indices,
                     expected_out,
                     out_grad,
                     expected_grad,
                     params_ragged_rank=None):
        """Tests that ragged_gather generates the right gradient.

    Args:
      params: The `params` that should be passed to `gather`.
      indices: The `indices` that should be passed to `gather`.
      expected_out: The expected value of `gather(params, indices)`.
        `expected_out.shape = indices.shape + params.shape[1:]`.
      out_grad: The value that should be fed in as the gradient for `out`
        when testing the gradient of `ragged_gather`.  Must have the same
        shape as `expected_out`.
      expected_grad: The expected gradient that should be returned for
        `params`.  Must have the same shape as `params`.
      params_ragged_rank: The ragged_rank of `params`.
    """
        if context.executing_eagerly():
            return

        params = ragged_factory_ops.constant(params,
                                             dtype=dtypes.float32,
                                             ragged_rank=params_ragged_rank)
        indices = constant_op.constant(indices, dtype=dtypes.int32)
        out_ragged_rank = params.ragged_rank + indices.shape.ndims - 1
        out_grad = ragged_factory_ops.constant(out_grad,
                                               dtype=dtypes.float32,
                                               ragged_rank=out_ragged_rank)
        expected_out = ragged_factory_ops.constant(expected_out,
                                                   dtype=dtypes.float32,
                                                   ragged_rank=out_ragged_rank)
        expected_grad = ragged_factory_ops.constant(
            expected_grad,
            dtype=dtypes.float32,
            ragged_rank=params.ragged_rank)

        out = ragged_gather_ops.gather(params, indices)
        self.assertAllClose(out, expected_out)

        grads = gradients_impl.gradients(out.flat_values,
                                         (params.nested_row_splits + (
                                             params.flat_values,
                                             indices,
                                         )), out_grad.flat_values)
        param_nested_splits_grads = grads[:-2]
        params_flat_values_grad = grads[-2]
        indices_grad = grads[-1]
        self.assertEqual(indices_grad, None)
        for splits_grad in param_nested_splits_grads:
            self.assertEqual(splits_grad, None)

        # The gradient generates an IndexedSlices; convert back to a normal Tensor.
        self.assertIsInstance(params_flat_values_grad,
                              indexed_slices.IndexedSlices)
        params_flat_values_grad = ops.convert_to_tensor(
            params_flat_values_grad)

        params_grad = params.with_flat_values(params_flat_values_grad)
        self.assertAllClose(params_grad, expected_grad, atol=2e-6, rtol=2e-6)
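A small standalone sketch (public TF 1.x API, my own illustration) of the IndexedSlices detail noted above: the gradient of a gather with respect to its params comes back as an IndexedSlices and can be densified with convert_to_tensor.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

params = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
indices = tf.constant([2, 0])
out = tf.gather(params, indices)

grad, = tf.gradients(out, params)
print(type(grad).__name__)                  # IndexedSlices, not a dense Tensor
dense_grad = tf.convert_to_tensor(grad)     # scatter the slices back into a dense Tensor

with tf.Session() as sess:
    print(sess.run(dense_grad))             # rows 0 and 2 get ones, row 1 stays zero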
Example #35
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           dropout=0.,
           num_dirs=1,
           dtype=dtypes.float32):
    # TODO(jamesqin): add multi-layer tests.
    # TODO(jamesqin): add multi-dir tests
    assert num_layers == 1
    assert num_dirs == 1
    if is_training and not np.isclose(dropout, 0):
        raise ValueError("dropout must be 0. when testing training.")

    # set graph level random seed and numpy random seed.
    random_seed.set_random_seed(0)
    np.random.seed(0)

    inputs = variable_scope.get_variable(
        "inputs",
        initializer=np.random.rand(time, batch_size,
                                   input_size).astype(dtype.as_numpy_dtype),
        dtype=dtype)
    initial_h_op = variable_scope.get_variable(
        "initial_h_op",
        initializer=np.random.rand(batch_size,
                                   num_units).astype(dtype.as_numpy_dtype),
        dtype=dtype)

    initializer = init_ops.random_uniform_initializer(-0.01,
                                                      0.01,
                                                      dtype=dtype,
                                                      seed=19980904)
    with variable_scope.variable_scope("test", initializer=initializer):
        gate_kernel = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/gates/kernel",
            shape=[input_size + num_units, num_units * 2],
            dtype=dtype)
        gate_bias = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/gates/bias",
            shape=[num_units * 2],
            dtype=dtype)
        candidate_inp_kernel = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
            shape=[input_size, num_units],
            dtype=dtype)
        candidate_inp_bias = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
            shape=[num_units],
            dtype=dtype)
        candidate_hid_kernel = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
            shape=[num_units, num_units],
            dtype=dtype)
        candidate_hid_bias = variable_scope.get_variable(
            "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
            shape=[num_units],
            dtype=dtype)

        cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
        outputs_op, h_op = rnn.dynamic_rnn(cell,
                                           inputs,
                                           initial_state=initial_h_op,
                                           dtype=dtype,
                                           time_major=True,
                                           scope=None)

    ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
    bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
    # Convert to cudnn opaque param.
    format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
        num_layers, num_units, input_size)
    opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)

    cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
    cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
        inputs,
        cu_initial_h_op,
        array_ops.zeros_like(cu_initial_h_op),  # not used
        opaque_params,
        dropout=dropout,
        is_training=is_training,
        rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

    if is_training:
        (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op,
         gb_grad_op, cib_grad_op, chb_grad_op) = gradients_impl.gradients(
             outputs_op, [inputs, initial_h_op] + ws + bs)

        (cu_inp_grad_op,
         cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
             cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
        # Remove the trivial 1st dimension
        cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)

        cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
            opaque_grad_op)
        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
        # The cudnn GRU has 2 biases for the reset and update gates. When
        # converting to the tf canonical format, the two biases are summed into
        # one, so this bias gradient should be halved before comparing with the
        # tf GRU.
        cu_gb_grad_op *= 0.5

    init_op = variables.global_variables_initializer()
    sess.run(init_op)

    if is_training:
        outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
            outputs_op, h_op, inp_grad_op, hgrad_op,
            (gk_grad_op, cik_grad_op, chk_grad_op),
            (gb_grad_op, cib_grad_op, chb_grad_op)
        ])
        (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad,
         cu_bgrad) = sess.run([
             cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
             (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
             (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
         ])
        # Remove the trivial 1st dimension
        cu_h = np.squeeze(cu_h, axis=0)

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "h: %s" % h)
        logging.vlog(1, "cu_h: %s" % h)
        logging.vlog(1, "inp_grad: %s" % inp_grad)
        logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
        logging.vlog(1, "hgrad: %s" % hgrad)
        logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
        logging.vlog(1, "wgrad: %s" % str(wgrad))
        logging.vlog(1, "bgrad: %s" % str(bgrad))
        logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
        logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
        return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
                cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
    else:
        outputs, h = sess.run([outputs_op, h_op])
        cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op])
        # Remove the trivial 1st dimension.
        cu_h = np.squeeze(cu_h, axis=0)

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "h: %s" % h)
        logging.vlog(1, "cu_h: %s" % h)
    return outputs, cu_outputs, h, cu_h
Example #36
    def testDeterministicGradients(self, data_layout, data_rank, data_type):
        with self.session(force_gpu=True):
            # Using a cached_session with force_gpu=True does not work at the time
            # of writing (2019-12-10). Before the @parameterized.named_parameters
            # decorator was added, this non-cached session context was set outside
            # the iteration loops for the parameter combinations, and so was re-used.
            seed = (hash(data_layout) % 256 + hash(data_rank) % 256 +
                    hash(data_type) % 256)
            np.random.seed(seed)
            batch_size = 10
            channel_count = 8
            data_dim = 14
            input_shape = self._makeShapeTuple(batch_size, channel_count,
                                               data_rank, data_dim,
                                               data_layout)
            bias_shape = (channel_count, )
            output_shape = input_shape
            input_val = self._randomDataOp(input_shape, data_type)
            bias_val = self._randomDataOp(bias_shape, data_type)
            data_format = self._dataFormatFromDataLayout(data_layout)
            repeat_count = 5
            if context.executing_eagerly():

                def bias_gradients(local_seed):
                    np.random.seed(local_seed)
                    upstream_gradients = self._randomDataOp(
                        output_shape, data_type)
                    with backprop.GradientTape(persistent=True) as tape:
                        tape.watch(bias_val)
                        bias_add_output = nn_ops.bias_add(
                            input_val, bias_val, data_format=data_format)
                        gradient_injector_output = bias_add_output * upstream_gradients
                    return tape.gradient(gradient_injector_output, bias_val)

                for i in range(repeat_count):
                    local_seed = seed + i  # select different upstream gradients
                    result_a = bias_gradients(local_seed)
                    result_b = bias_gradients(local_seed)
                    self.assertAllEqual(result_a, result_b)
            else:  # graph mode
                upstream_gradients = array_ops.placeholder(
                    data_type, shape=output_shape, name='upstream_gradients')
                bias_add_output = nn_ops.bias_add(input_val,
                                                  bias_val,
                                                  data_format=data_format)
                gradient_injector_output = bias_add_output * upstream_gradients
                # The gradient function behaves as if grad_ys were multiplied into
                # the op's gradient result rather than passed through the op's
                # gradient generation graph as upstream gradients. This is the reason
                # for using the gradient injector (see the sketch after this example).
                bias_gradients = gradients_impl.gradients(
                    gradient_injector_output,
                    bias_val,
                    grad_ys=None,
                    colocate_gradients_with_ops=True)[0]
                for i in range(repeat_count):
                    feed_dict = {
                        upstream_gradients: self._randomNDArray(output_shape)
                    }
                    result_a = bias_gradients.eval(feed_dict=feed_dict)
                    result_b = bias_gradients.eval(feed_dict=feed_dict)
                    self.assertAllEqual(result_a, result_b)
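
A minimal, self-contained sketch of the gradient-injector pattern described in the comment above, written against the public tf.compat.v1 API rather than the internal modules this test imports; the tensor names and shapes are illustrative only, not part of the test.

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.placeholder(tf.float32, shape=[4, 3], name="x")
bias = tf.constant([0.1, 0.2, 0.3], name="bias")
upstream = tf.placeholder(tf.float32, shape=[4, 3], name="upstream")

y = tf.nn.bias_add(x, bias)
# Multiplying by `upstream` makes the gradient flowing into bias_add equal to
# the injected values instead of tf.gradients' default grad_ys of ones.
injected = y * upstream
dx = tf.gradients(injected, x)[0]

with tf.Session() as sess:
    feed = {x: np.ones([4, 3], np.float32),
            upstream: np.random.randn(4, 3).astype(np.float32)}
    print(sess.run(dx, feed_dict=feed))  # equals the injected upstream values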
Beispiel #37
0
    def testCtcLossDenseUniqueFastPathWithBlankIndexIsSameAsCtcLoss(self):
        random_seed.set_random_seed(5)

        batch_size = 8
        num_labels = 6
        label_length = 5
        num_frames = 12
        logits = random_ops.random_uniform(
            [num_frames, batch_size, num_labels])
        labels = random_ops.random_uniform([batch_size, label_length],
                                           minval=0,
                                           maxval=num_labels - 1,
                                           dtype=dtypes.int64)

        label_lengths = random_ops.random_uniform([batch_size],
                                                  minval=2,
                                                  maxval=label_length,
                                                  dtype=dtypes.int64)
        label_mask = array_ops.sequence_mask(label_lengths,
                                             maxlen=label_length,
                                             dtype=label_lengths.dtype)
        labels *= label_mask

        logit_lengths = [num_frames] * batch_size

        tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
        tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
            tf_ctc_loss_labels, label_lengths)

        tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                          inputs=logits,
                                          sequence_length=logit_lengths,
                                          time_major=True)
        tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

        # Shift the blank logits/labels to be somewhere in the middle.
        blank_index = 2
        shifted_logits = array_ops.concat([
            logits[:, :, :blank_index],
            logits[:, :, -1:],
            logits[:, :, blank_index:-1],
        ],
                                          axis=2)
        shifted_labels = array_ops.where_v2(labels < blank_index, labels,
                                            labels + 1)

        ctc_loss = ctc_ops.ctc_loss_dense(
            labels=shifted_labels,
            logits=shifted_logits,
            label_length=label_lengths,
            logit_length=logit_lengths,
            blank_index=blank_index,
            unique=ctc_ops.ctc_unique_labels(shifted_labels))
        ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

        with self.cached_session() as sess:
            for _ in range(32):
                self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
                self.assertAllClose(*self.evaluate(
                    [ctc_loss_grads, tf_nn_ctc_grads]),
                                    rtol=2e-06,
                                    atol=2e-06)
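
The blank-index shift above is easiest to see on a tiny array. A hypothetical NumPy-only sketch mirroring the array_ops.concat and where_v2 calls: the blank logit stored in the last class column is moved to column blank_index, and label ids at or above blank_index move up by one.

import numpy as np

logits = np.arange(2 * 3 * 5, dtype=np.float32).reshape(2, 3, 5)  # [time, batch, labels]
blank_index = 2
shifted_logits = np.concatenate(
    [logits[:, :, :blank_index], logits[:, :, -1:], logits[:, :, blank_index:-1]],
    axis=2)

labels = np.array([[0, 1, 2, 3]])
shifted_labels = np.where(labels < blank_index, labels, labels + 1)
print(shifted_logits.shape)   # (2, 3, 5): same shape, columns rotated
print(shifted_labels)         # [[0 1 3 4]]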
Beispiel #38
0
    def testFunctions(self):
        dtype = dtypes.float32

        @function.Defun(dtype, dtype, dtype, dtype)
        def Grad(x, y, dout1, dout2):  # pylint: disable=unused-argument
            # Return the inputs for simplicity of testing. The correct return value
            # would be (dout1 + dout2, dout1 - dout2)
            return x, y

        @function.Defun(dtype, dtype, grad_func=Grad)
        def FuncWithGrad(x, y):
            return x + y, x - y

        @function.Defun(dtypes.int32)
        def ExternalTensorFunc(x):
            # c must be defined in the containing graph
            return x + c

        @function.Defun(dtypes.int32, dtypes.int32)
        def OuterFunc(x, y):
            @function.Defun(dtypes.int32)
            def InnerFunc(x):
                return x + x

            return InnerFunc(x) + y

        # Create graph with function calls and export to GraphDef
        with ops.Graph().as_default() as g1:
            p1 = array_ops.placeholder(dtype, name="p1")
            p2 = array_ops.placeholder(dtype, name="p2")
            # pylint: disable=unexpected-keyword-arg
            a, b = FuncWithGrad(p1, p2, name="f")

            c = constant_op.constant(10, dtype=dtypes.int32)
            ExternalTensorFunc(1, name="external")

            OuterFunc(10, 1, name="outer")
            # pylint: enable=unexpected-keyword-arg

        gdef = g1.as_graph_def()

        # Import GraphDef into new graph, add imported gradients, and test that
        # imported functions can be run
        with ops.Graph().as_default() as g2:
            p1, p2, a, b = importer.import_graph_def(
                gdef, return_elements=["p1:0", "p2:0", "f:0", "f:1"], name="")
            grad = gradients_impl.gradients([a], [p1, p2])

            with self.test_session(graph=g2) as sess:
                feed_dict = {p1: 1, p2: 2}
                a_val, b_val, grad_val = sess.run([a, b, grad],
                                                  feed_dict=feed_dict)
                self.assertEqual(a_val, 3.0)
                self.assertEqual(b_val, -1.0)
                # The Grad function returns the input values for testing.
                self.assertEqual(grad_val, [1.0, 2.0])
                self.assertEqual(sess.run("external:0"), 11)
                self.assertEqual(sess.run("outer:0"), 21)

        # Export the new graph and reimport to test that imported functions can be
        # successfully exported/imported again
        gdef = g2.as_graph_def()
        with ops.Graph().as_default() as g3:
            p1, p2, a, b = importer.import_graph_def(
                gdef, return_elements=["p1:0", "p2:0", "f:0", "f:1"], name="")
            # Create new gradient functions (in addition to the imported gradient
            # functions created in g2).
            grad = gradients_impl.gradients([a], [p1, p2])

            with self.test_session(graph=g3) as sess:
                feed_dict = {p1: 1, p2: 2}
                a_val, b_val, grad_val = sess.run([a, b, grad],
                                                  feed_dict=feed_dict)
                self.assertEqual(a_val, 3.0)
                self.assertEqual(b_val, -1.0)
                self.assertEqual(grad_val, [1.0, 2.0])
                self.assertEqual(sess.run("external:0"), 11)
                self.assertEqual(sess.run("outer:0"), 21)
Beispiel #39
0
def wasserstein_gradient_penalty(
    real_data,
    generated_data,
    generator_inputs,
    discriminator_fn,
    discriminator_scope,
    epsilon=1e-10,
    weights=1.0,
    scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
    add_summaries=False):
  """The gradient penalty for the Wasserstein discriminator loss.
  See `Improved Training of Wasserstein GANs`
  (https://arxiv.org/abs/1704.00028) for more details.
  Args:
    real_data: Real data.
    generated_data: Output of the generator.
    generator_inputs: Exact argument to pass to the generator, which is used
      as optional conditioning to the discriminator.
    discriminator_fn: A discriminator function that conforms to TFGAN API.
    discriminator_scope: If not `None`, reuse discriminators from this scope.
    epsilon: A small positive number added for numerical stability when
      computing the gradient norm.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `real_data` and `generated_data`, and must be broadcastable to
      them (i.e., all dimensions must be either `1`, or the same as the
      corresponding dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which this loss will be added.
    reduction: A `tf.losses.Reduction` to apply to loss.
    add_summaries: Whether or not to add summaries for the loss.
  Returns:
    A loss Tensor. The shape depends on `reduction`.
  Raises:
    ValueError: If the rank of data Tensors is unknown.
  """
  real_data = ops.convert_to_tensor(real_data)
  generated_data = ops.convert_to_tensor(generated_data)
  if real_data.shape.ndims is None:
    raise ValueError('`real_data` can\'t have unknown rank.')
  if generated_data.shape.ndims is None:
    raise ValueError('`generated_data` can\'t have unknown rank.')

  differences = generated_data - real_data
  batch_size = differences.shape[0].value or array_ops.shape(differences)[0]
  alpha_shape = [batch_size] + [1] * (differences.shape.ndims - 1)
  alpha = random_ops.random_uniform(shape=alpha_shape)
  interpolates = real_data + (alpha * differences)

  # Reuse variables if a discriminator scope already exists.
  reuse = discriminator_scope is not None
  with variable_scope.variable_scope(discriminator_scope, 'gpenalty_dscope',
                                     reuse=reuse):
    disc_interpolates = discriminator_fn(interpolates, generator_inputs)

  if isinstance(disc_interpolates, tuple):
    # ACGAN case: disc outputs more than one tensor
    disc_interpolates = disc_interpolates[0]

  gradients = gradients_impl.gradients(disc_interpolates, interpolates)[0]
  gradient_squares = math_ops.reduce_sum(
      math_ops.square(gradients), axis=list(range(1, gradients.shape.ndims)))
  # Propagate shape information, if possible.
  if isinstance(batch_size, int):
    gradient_squares.set_shape([
        batch_size] + gradient_squares.shape.as_list()[1:])
  # For numerical stability, add epsilon to the sum before taking the square
  # root. Note tf.norm does not add epsilon.
  slopes = math_ops.sqrt(gradient_squares + epsilon)
  penalties = math_ops.square(slopes - 1.0)
  penalty = losses.compute_weighted_loss(
      penalties, weights, scope=scope, loss_collection=loss_collection,
      reduction=reduction)

  if add_summaries:
    summary.scalar('gradient_penalty_loss', penalty)

  return penalty
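
A hedged, self-contained sketch of the core computation above, using a toy linear critic; toy_disc and the shapes are assumptions for illustration, not part of the TFGAN API.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

def toy_disc(x):
  # Any differentiable per-example critic works for the sketch.
  return tf.reduce_sum(x * 3.0, axis=1)

real = tf.random.uniform([8, 4])
fake = tf.random.uniform([8, 4])

alpha = tf.random.uniform([8, 1])
interpolates = real + alpha * (fake - real)
disc_out = toy_disc(interpolates)

grads = tf.gradients(disc_out, [interpolates])[0]
grad_squares = tf.reduce_sum(tf.square(grads), axis=1)
slopes = tf.sqrt(grad_squares + 1e-10)              # epsilon for stability
penalty = tf.reduce_mean(tf.square(slopes - 1.0))

with tf.Session() as sess:
  print(sess.run(penalty))  # ~25.0 here, since the toy critic's slope is 6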
Beispiel #40
0
 def step(c):
     x = array_ops.identity(42.)
     y = comm_fn(x) * c
     return gradients_impl.gradients(y, [x])[0]
Beispiel #41
0
 def testNoIntegerGradient5(self):
     k = constant_op.constant([3, 4])
     m = k * k
     n = m * m
     dn_dk, = gradients_impl.gradients(n, k)
     self.assertIsNone(dn_dk)
Beispiel #42
0
 def testNoIntegerGradient4(self):
     k = constant_op.constant([3, 4])
     m = k * k * k
     dm_dk, = gradients_impl.gradients(m, k)
     self.assertIsNone(dm_dk)
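
A short sketch of the behaviour these two tests pin down: tf.gradients does not backpropagate through integer tensors and returns None, while the same computation on floats yields the expected derivative. Public API; names are illustrative.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

k = tf.constant([3, 4])
m = k * k
print(tf.gradients(m, k))        # [None]: no gradient through int32

kf = tf.cast(k, tf.float32)
mf = kf * kf
dmf_dkf, = tf.gradients(mf, kf)  # d(kf^2)/dkf = 2 * kf
with tf.Session() as sess:
  print(sess.run(dmf_dkf))       # [6. 8.]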
Beispiel #43
0
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            is_training=True,
            dropout=0.,
            num_dirs=1,
            dtype=dtypes.float32):
    # TODO(jamesqin): add multi-layer tests.
    # TODO(jamesqin): add multi-dir tests.
    assert num_layers == 1
    assert num_dirs == 1
    if is_training and not np.isclose(dropout, 0):
        raise ValueError("dropout must be 0. when testing training.")

    # set graph level random seed and numpy random seed.
    random_seed.set_random_seed(0)
    np.random.seed(0)

    inputs = variable_scope.get_variable(
        "inputs",
        initializer=np.random.rand(time, batch_size,
                                   input_size).astype(dtype.as_numpy_dtype),
        dtype=dtype)
    initial_h_op = variable_scope.get_variable(
        "initial_h_op",
        initializer=np.random.rand(batch_size,
                                   num_units).astype(dtype.as_numpy_dtype),
        dtype=dtype)
    initial_c_op = variable_scope.get_variable(
        "initial_c_op",
        initializer=np.random.rand(batch_size,
                                   num_units).astype(dtype.as_numpy_dtype),
        dtype=dtype)

    initializer = init_ops.random_uniform_initializer(-0.01,
                                                      0.01,
                                                      dtype=dtype,
                                                      seed=19980904)

    with variable_scope.variable_scope("test", initializer=initializer):
        w = variable_scope.get_variable(
            "rnn/lstm_cell/kernel",
            shape=[input_size + num_units, num_units * 4],
            dtype=dtype)
        b = variable_scope.get_variable("rnn/lstm_cell/bias",
                                        shape=[num_units * 4],
                                        dtype=dtype)

        # Canonical LSTM; must set forget_bias to 0. to align with the cudnn LSTM.
        cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
        outputs_op, state_tuple_op = rnn.dynamic_rnn(
            cell,
            inputs,
            initial_state=rnn_cell_impl.LSTMStateTuple(h=initial_h_op,
                                                       c=initial_c_op),
            dtype=dtype,
            time_major=True,
            scope=None)

    # Convert to cudnn opaque param.
    format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
        num_layers, num_units, input_size)
    opaque_params = format_converter.tf_canonical_to_opaque([w, b])

    cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
    cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0)
    cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
        inputs,
        cu_initial_h_op,
        cu_initial_c_op,
        opaque_params,
        dropout=dropout,
        is_training=is_training,
        rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
    # Remove the trivial 1st dimension.
    cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
        c=array_ops.squeeze(cu_c_op, axis=0),
        h=array_ops.squeeze(cu_h_op, axis=0))

    if is_training:
        (inp_grad_op, hgrad_op, cgrad_op,
         wgrad_op, bgrad_op) = gradients_impl.gradients(
             outputs_op, [inputs, initial_h_op, initial_c_op, w, b])

        (cu_inp_grad_op, cu_hgrad_op,
         cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
             cu_outputs_op,
             [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
        # Remove the trivial 1st dimension
        cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
        # Remove the trivial 1st dimension
        cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0)

        cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
            opaque_grad_op)
        cu_wgrad_op = cu_wgrad_op[0]
        cu_bgrad_op = cu_bgrad_op[0]
        # The cudnn LSTM has 2 biases per gate. When converting to the tf canonical
        # format, the two biases are summed into one, so the bias gradient must be
        # halved here when comparing with the tf LSTM (see the sketch after RunLSTM).
        cu_bgrad_op *= 0.5

    init_op = variables.global_variables_initializer()
    sess.run(init_op)

    if is_training:
        outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
            outputs_op, state_tuple_op, inp_grad_op, (hgrad_op, cgrad_op),
            wgrad_op, bgrad_op
        ])
        (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
         cu_bgrad) = sess.run([
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ])

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "state_tuple: %s" % str(state_tuple))
        logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
        logging.vlog(1, "inp_grad: %s" % inp_grad)
        logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
        logging.vlog(1, "state_grad: %s" % str(state_grad))
        logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
        logging.vlog(1, "wgrad: %s" % str(wgrad))
        logging.vlog(1, "bgrad: %s" % str(bgrad))
        logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
        logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
        return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
                cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
                cu_bgrad)
    else:
        outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
        cu_outputs, cu_state_tuple = sess.run(
            [cu_outputs_op, cu_state_tuple_op])

        logging.vlog(1, "outputs: %s" % outputs)
        logging.vlog(1, "cu_outputs: %s" % cu_outputs)
        logging.vlog(1, "state_tuple: %s" % str(state_tuple))
        logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    return outputs, cu_outputs, state_tuple, cu_state_tuple
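
A toy illustration of the bias-gradient halving commented on inside RunLSTM above: when the same pre-activation carries two additive biases b1 + b2 instead of a single b, summing d/db1 and d/db2 (as the canonical-format conversion does) yields exactly twice d/db. The tanh toy below is an assumption for illustration, not the cudnn kernel.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.constant([1.0, 2.0, 3.0])
b1 = tf.constant([0.1, 0.1, 0.1])
b2 = tf.constant([0.2, 0.2, 0.2])
b = b1 + b2

loss_two_bias = tf.reduce_sum(tf.tanh(x + b1 + b2))
loss_one_bias = tf.reduce_sum(tf.tanh(x + b))

g1, g2 = tf.gradients(loss_two_bias, [b1, b2])
g, = tf.gradients(loss_one_bias, [b])

with tf.Session() as sess:
  summed, single = sess.run([g1 + g2, g])
  print(summed, single)  # summed == 2 * single, hence the 0.5 factor above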
Beispiel #44
0
    def test_vimco_and_gradient(self):

        with self.test_session() as sess:
            dims = 5  # Dimension
            num_draws = int(20)
            num_batch_draws = int(3)
            seed = 1

            f = lambda logu: cd.kl_reverse(logu, self_normalized=False)
            np_f = lambda logu: -logu

            p = mvn_full_lib.MultivariateNormalFullCovariance(
                covariance_matrix=tridiag(
                    dims, diag_value=1, offdiag_value=0.5))

            # Variance is very high when approximating Forward KL, so we make
            # scale_diag larger than in test_kl_reverse_multidim. This ensures q
            # "covers" p and thus Var_q[p/q] is smaller.
            s = array_ops.constant(1.)
            q = mvn_diag_lib.MultivariateNormalDiag(
                scale_diag=array_ops.tile([s], [dims]))

            vimco = cd.csiszar_vimco(f=f,
                                     p_log_prob=p.log_prob,
                                     q=q,
                                     num_draws=num_draws,
                                     num_batch_draws=num_batch_draws,
                                     seed=seed)

            x = q.sample(sample_shape=[num_draws, num_batch_draws], seed=seed)
            x = array_ops.stop_gradient(x)
            logu = p.log_prob(x) - q.log_prob(x)
            f_log_sum_u = f(cd.csiszar_vimco_helper(logu)[0])

            grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]

            def jacobian(x):
                # Warning: this function is slow and may not even finish if prod(shape)
                # is larger than, say, 100.
                shape = x.shape.as_list()
                assert all(s is not None for s in shape)
                x = array_ops.reshape(x, shape=[-1])
                r = [grad_sum(x[i]) for i in range(np.prod(shape))]
                return array_ops.reshape(array_ops.stack(r), shape=shape)

            [
                logu_,
                jacobian_logqx_,
                vimco_,
                grad_vimco_,
                f_log_sum_u_,
                grad_mean_f_log_sum_u_,
            ] = sess.run([
                logu,
                jacobian(q.log_prob(x)),
                vimco,
                grad_sum(vimco),
                f_log_sum_u,
                grad_sum(f_log_sum_u) / num_batch_draws,
            ])

            np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu_)

            # Test VIMCO loss is correct.
            self.assertAllClose(np_f(np_log_avg_u).mean(axis=0),
                                vimco_,
                                rtol=1e-5,
                                atol=0.)

            # Test gradient of VIMCO loss is correct.
            #
            # To make this computation we'll inject two gradients from TF:
            # - grad[mean(f(log(sum(p(x)/q(x)))))]
            # - jacobian[log(q(x))].
            #
            # We now justify why using these (and only these) TF values for
            # ground-truth does not undermine the completeness of this test.
            #
            # Regarding `grad_mean_f_log_sum_u_`, note that we validate the
            # correctness of the zero-th order derivative (for each batch member).
            # Since `cd.csiszar_vimco_helper` itself does not manipulate any gradient
            # information, we can safely rely on TF.
            self.assertAllClose(np_f(np_log_avg_u),
                                f_log_sum_u_,
                                rtol=1e-4,
                                atol=0.)
            #
            # Regarding `jacobian_logqx_`, note that testing the gradient of
            # `q.log_prob` is outside the scope of this unit-test thus we may safely
            # use TF to find it.

            # The `mean` is across batches and the `sum` is across iid samples.
            np_grad_vimco = (grad_mean_f_log_sum_u_ + np.mean(np.sum(
                jacobian_logqx_ * (np_f(np_log_avg_u) - np_f(np_log_sooavg_u)),
                axis=0),
                                                              axis=0))

            self.assertAllClose(np_grad_vimco, grad_vimco_, rtol=1e-5, atol=0.)
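
For readability, the quantity assembled by the numpy expression above can be written out; the notation below is ours, not the library's. With B = num_batch_draws, N = num_draws, f(l) = -l, log_avg_u for batch member b written \bar L_b and the swap-one-out log_sooavg_u written \bar L_b^{(-i)}, the reference gradient is

\widehat{\nabla_s\,\mathrm{vimco}}
  = \nabla_s\Big[\tfrac{1}{B}\sum_{b=1}^{B} f(\bar L_b)\Big]
  + \frac{1}{B}\sum_{b=1}^{B}\sum_{i=1}^{N}
      \frac{\partial \log q(x_{i,b})}{\partial s}\,
      \big(f(\bar L_b) - f(\bar L_b^{(-i)})\big),

where the first term is grad_mean_f_log_sum_u_ and the partial derivatives form jacobian_logqx_.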
Beispiel #45
0
    def test_score_trick(self):

        with self.test_session() as sess:
            d = 5  # Dimension
            num_draws = int(1e5)
            seed = 1

            p = mvn_full_lib.MultivariateNormalFullCovariance(
                covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))

            # Variance is very high when approximating Forward KL, so we make
            # scale_diag larger than in test_kl_reverse_multidim. This ensures q
            # "covers" p and thus Var_q[p/q] is smaller.
            s = array_ops.constant(1.)
            q = mvn_diag_lib.MultivariateNormalDiag(
                scale_diag=array_ops.tile([s], [d]))

            approx_kl = cd.monte_carlo_csiszar_f_divergence(
                f=cd.kl_reverse,
                p_log_prob=p.log_prob,
                q=q,
                num_draws=num_draws,
                seed=seed)

            approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
                f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
                p_log_prob=p.log_prob,
                q=q,
                num_draws=num_draws,
                seed=seed)

            approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
                f=cd.kl_reverse,
                p_log_prob=p.log_prob,
                q=q,
                num_draws=num_draws,
                use_reparametrization=False,
                seed=seed)

            approx_kl_self_normalized_score_trick = (
                cd.monte_carlo_csiszar_f_divergence(
                    f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
                    p_log_prob=p.log_prob,
                    q=q,
                    num_draws=num_draws,
                    use_reparametrization=False,
                    seed=seed))

            exact_kl = kullback_leibler.kl_divergence(q, p)

            grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]

            [
                approx_kl_grad_,
                approx_kl_self_normalized_grad_,
                approx_kl_score_trick_grad_,
                approx_kl_self_normalized_score_trick_grad_,
                exact_kl_grad_,
                approx_kl_,
                approx_kl_self_normalized_,
                approx_kl_score_trick_,
                approx_kl_self_normalized_score_trick_,
                exact_kl_,
            ] = sess.run([
                grad_sum(approx_kl),
                grad_sum(approx_kl_self_normalized),
                grad_sum(approx_kl_score_trick),
                grad_sum(approx_kl_self_normalized_score_trick),
                grad_sum(exact_kl),
                approx_kl,
                approx_kl_self_normalized,
                approx_kl_score_trick,
                approx_kl_self_normalized_score_trick,
                exact_kl,
            ])

            # Test average divergence.
            self.assertAllClose(approx_kl_, exact_kl_, rtol=0.02, atol=0.)

            self.assertAllClose(approx_kl_self_normalized_,
                                exact_kl_,
                                rtol=0.08,
                                atol=0.)

            self.assertAllClose(approx_kl_score_trick_,
                                exact_kl_,
                                rtol=0.02,
                                atol=0.)

            self.assertAllClose(approx_kl_self_normalized_score_trick_,
                                exact_kl_,
                                rtol=0.08,
                                atol=0.)

            # Test average gradient-divergence.
            self.assertAllClose(approx_kl_grad_,
                                exact_kl_grad_,
                                rtol=0.007,
                                atol=0.)

            self.assertAllClose(approx_kl_self_normalized_grad_,
                                exact_kl_grad_,
                                rtol=0.011,
                                atol=0.)

            self.assertAllClose(approx_kl_score_trick_grad_,
                                exact_kl_grad_,
                                rtol=0.018,
                                atol=0.)

            self.assertAllClose(approx_kl_self_normalized_score_trick_grad_,
                                exact_kl_grad_,
                                rtol=0.017,
                                atol=0.)
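
A toy NumPy sketch (unrelated to the library internals above) of the two gradient estimators this test compares: the reparameterization gradient and the score-function ("score trick") gradient of d/dmu E_{x~N(mu,1)}[x^2], both of which should be close to 2*mu.

import numpy as np

rng = np.random.RandomState(0)
mu = 1.5
n = 1000000

# Reparameterization: write x = mu + eps and differentiate the integrand.
eps = rng.randn(n)
reparam_grad = np.mean(2.0 * (mu + eps))

# Score function: E[x^2 * d/dmu log N(x; mu, 1)] = E[x^2 * (x - mu)].
x = mu + rng.randn(n)
score_grad = np.mean(x ** 2 * (x - mu))

print(reparam_grad, score_grad, 2 * mu)  # all close to 3.0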
Beispiel #46
0
 def _testGradientVariableSize(self):
     with self.test_session(use_gpu=True):
         inp = constant_op.constant([1.0, 2.0, 3.0], name="in")
         out = array_ops.slice(inp, [1], [-1])
         grad_actual = gradients_impl.gradients(out, inp)[0].eval()
     self.assertAllClose([0., 1., 1.], grad_actual)
Beispiel #47
0
 def _make_tensor(self):
     x = array_ops.placeholder(dtypes.float64, (3, 1))
     w = array_ops.constant(npr.RandomState(0).randn(3, 3))
     y = math_ops.matmul(w, x)
     g = gradients_impl.gradients(y, x)[0]
     return g
Beispiel #48
0
    def testLSTMFusedSequenceLengths(self):
        """Verify proper support for sequence lengths in LSTMBlockFusedCell."""
        with self.test_session(use_gpu=self._use_gpu) as sess:
            batch_size = 3
            input_size = 4
            cell_size = 5
            max_sequence_length = 6

            inputs = []
            for _ in range(max_sequence_length):
                inp = ops.convert_to_tensor(np.random.randn(
                    batch_size, input_size),
                                            dtype=dtypes.float32)
                inputs.append(inp)
            seq_lengths = constant_op.constant([3, 4, 5])

            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890213)
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                cell = core_rnn_cell_impl.BasicLSTMCell(cell_size,
                                                        state_is_tuple=True)
                outputs, state = core_rnn.static_rnn(
                    cell,
                    inputs,
                    dtype=dtypes.float32,
                    sequence_length=seq_lengths)
                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_state = sess.run([outputs, state[0]])
                basic_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(outputs,
                                             variables.trainable_variables()))

            with variable_scope.variable_scope("fused",
                                               initializer=initializer):
                cell = lstm_ops.LSTMBlockFusedCell(cell_size,
                                                   cell_clip=0,
                                                   use_peephole=False)
                outputs, state = cell(inputs,
                                      dtype=dtypes.float32,
                                      sequence_length=seq_lengths)

                sess.run([variables.global_variables_initializer()])
                fused_outputs, fused_state = sess.run([outputs, state[0]])
                fused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused/")
                ]
                fused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_vars))

            self.assertAllClose(basic_outputs, fused_outputs)
            self.assertAllClose(basic_state, fused_state)
            self.assertAllClose(basic_grads, fused_grads)
            for basic, fused in zip(basic_wgrads, fused_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)

            # Verify that state propagation works if we turn our sequence into
            # tiny (single-time) subsequences, i.e. unfuse the cell.
            with variable_scope.variable_scope("unfused",
                                               initializer=initializer) as vs:
                cell = lstm_ops.LSTMBlockFusedCell(cell_size,
                                                   cell_clip=0,
                                                   use_peephole=False)
                outputs = []
                state = None
                for i, inp in enumerate(inputs):
                    lengths = [int(i < l) for l in seq_lengths.eval()]
                    output, state = cell([inp],
                                         initial_state=state,
                                         dtype=dtypes.float32,
                                         sequence_length=lengths)
                    vs.reuse_variables()
                    outputs.append(output[0])
                outputs = array_ops.stack(outputs)

                sess.run([variables.global_variables_initializer()])
                unfused_outputs, unfused_state = sess.run([outputs, state[0]])
                unfused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                unfused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("unfused/")
                ]
                unfused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, unfused_vars))

            self.assertAllClose(basic_outputs, unfused_outputs)
            self.assertAllClose(basic_state, unfused_state)
            self.assertAllClose(basic_grads, unfused_grads)
            for basic, unfused in zip(basic_wgrads, unfused_wgrads):
                self.assertAllClose(basic, unfused, rtol=1e-2, atol=1e-2)
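
The per-step sequence_length values built inside the "unfused" loop above are just a 0/1 mask per batch element; a two-line sketch makes the pattern explicit.

seq_lengths = [3, 4, 5]
for i in range(6):
    print(i, [int(i < l) for l in seq_lengths])
# 0-2 -> [1, 1, 1]; 3 -> [0, 1, 1]; 4 -> [0, 0, 1]; 5 -> [0, 0, 0]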
Beispiel #49
0
    def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
        batch_size = 2
        cell_size = 3
        input_size = 4
        time_steps = 2
        with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
            # Random initializers.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=seed)
            np.random.seed(seed)

            # Inputs
            concat_x = array_ops.placeholder(dtypes.float32,
                                             shape=(time_steps, batch_size,
                                                    input_size))
            h = array_ops.zeros([batch_size, cell_size])

            # Values for the inputs.
            x_values = np.random.rand(time_steps, batch_size, input_size)
            h_value = np.random.rand(batch_size, cell_size)
            feeds = {concat_x: x_values, h: h_value}

            # Gradients from the block GRU cell implementation.
            with vs.variable_scope("block", initializer=initializer):
                cell = gru_ops.GRUBlockCell(cell_size)

                outputs_dynamic, _ = rnn.dynamic_rnn(cell,
                                                     inputs=concat_x,
                                                     initial_state=h,
                                                     time_major=True,
                                                     dtype=dtypes.float32)
                grad_output_wrt_x = gradients_impl.gradients(
                    [outputs_dynamic[0]], concat_x)
                grad_output_wrt_h = gradients_impl.gradients(
                    [outputs_dynamic[0]], h)

                sess.run([variables.global_variables_initializer()])
                block_grad_res_x, block_grad_res_h = sess.run(
                    [grad_output_wrt_x, grad_output_wrt_h], feeds)

            # Gradients from the basic GRU cell implementation.
            with vs.variable_scope("basic", initializer=initializer):
                cell = rnn_cell.GRUCell(cell_size)

                outputs_dynamic, _ = rnn.dynamic_rnn(cell,
                                                     inputs=concat_x,
                                                     initial_state=h,
                                                     time_major=True,
                                                     dtype=dtypes.float32)
                grad_output_wrt_x = gradients_impl.gradients(
                    [outputs_dynamic[0]], concat_x)
                grad_output_wrt_h = gradients_impl.gradients(
                    [outputs_dynamic[0]], h)

                sess.run([variables.global_variables_initializer()])
                basic_grad_res_x, basic_grad_res_h = sess.run(
                    [grad_output_wrt_x, grad_output_wrt_h], feeds)

        # Check that the number of derivatives of the outputs w.r.t. x matches.
        self.assertEqual(len(block_grad_res_x), len(basic_grad_res_x))

        # Check the derivative values of the outputs w.r.t. x.
        for block, basic in zip(block_grad_res_x, basic_grad_res_x):
            self.assertAllClose(block, basic)

        # Check that the number of derivatives of the outputs w.r.t. h matches.
        self.assertEqual(len(block_grad_res_h), len(basic_grad_res_h))

        # Check the derivative values of the outputs w.r.t. h.
        for block, basic in zip(block_grad_res_h, basic_grad_res_h):
            self.assertAllClose(block, basic)
Beispiel #50
0
    def testLSTMBasicToBlockPeeping(self):
        with self.test_session(use_gpu=self._use_gpu) as sess:
            batch_size = 2
            input_size = 3
            cell_size = 4
            sequence_length = 5

            inputs = []
            for _ in range(sequence_length):
                inp = ops.convert_to_tensor(np.random.randn(
                    batch_size, input_size),
                                            dtype=dtypes.float32)
                inputs.append(inp)

            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890212)
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                cell = core_rnn_cell_impl.LSTMCell(cell_size,
                                                   use_peepholes=True,
                                                   state_is_tuple=True)
                outputs, state = core_rnn.static_rnn(cell,
                                                     inputs,
                                                     dtype=dtypes.float32)

                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_state = sess.run([outputs, state[0]])
                basic_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(outputs,
                                             variables.trainable_variables()))

            with variable_scope.variable_scope("block",
                                               initializer=initializer):
                w = variable_scope.get_variable(
                    "w",
                    shape=[input_size + cell_size, cell_size * 4],
                    dtype=dtypes.float32)
                b = variable_scope.get_variable(
                    "b",
                    shape=[cell_size * 4],
                    dtype=dtypes.float32,
                    initializer=init_ops.zeros_initializer())

                wci = variable_scope.get_variable("wci",
                                                  shape=[cell_size],
                                                  dtype=dtypes.float32)
                wcf = variable_scope.get_variable("wcf",
                                                  shape=[cell_size],
                                                  dtype=dtypes.float32)
                wco = variable_scope.get_variable("wco",
                                                  shape=[cell_size],
                                                  dtype=dtypes.float32)

                _, _, _, _, _, _, outputs = block_lstm(ops.convert_to_tensor(
                    sequence_length, dtype=dtypes.int64),
                                                       inputs,
                                                       w,
                                                       b,
                                                       wci=wci,
                                                       wcf=wcf,
                                                       wco=wco,
                                                       cell_clip=0,
                                                       use_peephole=True)

                sess.run([variables.global_variables_initializer()])
                block_outputs = sess.run(outputs)
                block_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                block_wgrads = sess.run(
                    gradients_impl.gradients(outputs, [w, b, wci, wcf, wco]))

            self.assertAllClose(basic_outputs, block_outputs)
            self.assertAllClose(basic_grads, block_grads)
            for basic, block in zip(basic_wgrads, block_wgrads):
                self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)

            with variable_scope.variable_scope("fused",
                                               initializer=initializer):
                cell = lstm_ops.LSTMBlockFusedCell(cell_size,
                                                   cell_clip=0,
                                                   use_peephole=True)
                outputs, state = cell(inputs, dtype=dtypes.float32)

                sess.run([variables.global_variables_initializer()])
                fused_outputs, fused_state = sess.run([outputs, state[0]])
                fused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused/")
                ]
                fused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_vars))

            self.assertAllClose(basic_outputs, fused_outputs)
            self.assertAllClose(basic_state, fused_state)
            self.assertAllClose(basic_grads, fused_grads)
            for basic, fused in zip(basic_wgrads, fused_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
Beispiel #51
0
 def _get_grads_lists_empirical(self, tensors):
   grads_flat = gradients_impl.gradients(self._layers.total_loss(),
                                         nest.flatten(tensors))
   grads_all = nest.pack_sequence_as(tensors, grads_flat)
   return tuple((grad,) for grad in grads_all)
Beispiel #52
0
    def testHigherRank(self):
        # We check that scalar and empty indices shapes work as well
        shape = (2, 1, 3, 2)
        for indices_shape in (), (0, ), (2, 0), (2, 3):
            for dtype in _TEST_TYPES:
                for axis in range(len(shape)):
                    params = self._buildParams(np.random.randn(*shape), dtype)
                    indices = np.random.randint(shape[axis],
                                                size=indices_shape)
                    with self.cached_session(use_gpu=True) as sess:
                        tf_params = constant_op.constant(params)
                        tf_indices = constant_op.constant(indices)
                        # Check that both positive and negative indices for axis work.
                        tf_axis = constant_op.constant(axis)
                        tf_negative_axis = constant_op.constant(-len(shape) +
                                                                axis)
                        gather = array_ops.gather(tf_params,
                                                  tf_indices,
                                                  axis=tf_axis)
                        gather_negative_axis = array_ops.gather(
                            tf_params, tf_indices, axis=tf_negative_axis)
                        gather_value, gather_negative_axis_value = sess.run(
                            [gather, gather_negative_axis])
                        gather_np = np.take(params, indices, axis)
                        self.assertAllEqual(gather_np, gather_value)
                        self.assertAllEqual(gather_np,
                                            gather_negative_axis_value)
                        expected_shape = (params.shape[:axis] + indices.shape +
                                          params.shape[axis + 1:])
                        self.assertEqual(expected_shape, gather.shape)
                        self.assertEqual(expected_shape,
                                         gather_negative_axis.shape)

                        # Test gradients
                        gather_grad = np.random.randn(
                            *gather.get_shape().as_list()).astype(
                                dtype.as_numpy_dtype)
                        if dtype.is_complex:
                            gather_grad -= 1j * gather_grad
                        params_grad, indices_grad, axis_grad = gradients_impl.gradients(
                            gather, [tf_params, tf_indices, tf_axis],
                            gather_grad)
                        self.assertIsNone(indices_grad)
                        self.assertIsNone(axis_grad)
                        if dtype.is_integer:
                            self.assertIsNone(params_grad)
                            continue
                        # For axis 0, we are able to create an efficient IndexedSlices for
                        # the gradient.
                        if axis == 0:
                            self.assertEqual(type(params_grad),
                                             ops.IndexedSlices)
                            params_grad = ops.convert_to_tensor(params_grad)
                        correct_params_grad = np.zeros(shape).astype(
                            dtype.as_numpy_dtype)
                        outer_dims = axis
                        inner_dims = len(shape) - axis - 1
                        gather_grad = gather_grad.reshape(shape[:axis] +
                                                          (indices.size, ) +
                                                          shape[axis + 1:])
                        for source_index, dest_index in enumerate(
                                indices.flat):
                            dest_slice = ((slice(None), ) * outer_dims +
                                          (dest_index, ) +
                                          (slice(None), ) * inner_dims)
                            source_slice = ((slice(None), ) * outer_dims +
                                            (source_index, ) +
                                            (slice(None), ) * inner_dims)
                            correct_params_grad[dest_slice] += gather_grad[
                                source_slice]
                        self.assertAllClose(correct_params_grad,
                                            self.evaluate(params_grad),
                                            atol=2e-6,
                                            rtol=2e-6)
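
A short sketch of the IndexedSlices case checked above: for axis 0, the gradient of gather with respect to params comes back as an IndexedSlices that scatters the upstream gradient into the gathered rows. Public API; values are illustrative.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

params = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
indices = tf.constant([2, 0])
gathered = tf.gather(params, indices, axis=0)

params_grad, = tf.gradients(gathered, [params])
print(type(params_grad).__name__)   # IndexedSlices, not a dense Tensor

with tf.Session() as sess:
  print(sess.run(tf.convert_to_tensor(params_grad)))
  # [[1. 1.]
  #  [0. 0.]
  #  [1. 1.]]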
Beispiel #53
0
    def _test_grad_grad(self,
                        x_shape,
                        x_dtype,
                        scale_shape,
                        scale_dtype,
                        use_gpu=True,
                        exponential_avg_factor=1.0,
                        data_format='NHWC',
                        is_training=True,
                        err_tolerance=1e-3):
        np.random.seed(1)
        x_val = np.random.random_sample(x_shape).astype(x_dtype)
        grad_y_val = np.random.random_sample(x_shape).astype(x_dtype)
        scale_val = np.random.random_sample(scale_shape).astype(scale_dtype)
        offset_val = np.random.random_sample(scale_shape).astype(scale_dtype)

        with self.cached_session(use_gpu=use_gpu) as sess:
            x = constant_op.constant(x_val, name='x')
            grad_y = constant_op.constant(grad_y_val, name='grad_y')
            scale = constant_op.constant(scale_val, name='scale')
            offset = constant_op.constant(offset_val, name='offset')
            if is_training and exponential_avg_factor == 1.0:
                pop_mean = None
                pop_var = None
            else:
                pop_mean = np.random.random_sample(scale_shape).astype(
                    scale_dtype)
                pop_var = np.random.random_sample(scale_shape).astype(
                    scale_dtype)
            y, _, _ = nn_impl.fused_batch_norm(
                x,
                scale,
                offset,
                mean=pop_mean,
                variance=pop_var,
                exponential_avg_factor=exponential_avg_factor,
                data_format=data_format,
                is_training=is_training)
            grad_x, grad_scale, grad_offset = gradients_impl.gradients(
                y, [x, scale, offset], grad_y)

            if is_training:
                epsilon = y.op.get_attr('epsilon')
                data_format = y.op.get_attr('data_format')
                grad_vals = self.evaluate([grad_x, grad_scale, grad_offset])
                grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale,
                                                       pop_mean, pop_var,
                                                       epsilon, data_format)
                grad_internal_vals = self.evaluate(list(grad_internal))
                for grad_val, grad_internal_val in zip(grad_vals,
                                                       grad_internal_vals):
                    self.assertAllClose(grad_val,
                                        grad_internal_val,
                                        atol=err_tolerance)

            if x_dtype != np.float16:
                err_grad_grad_y_1 = gradient_checker.compute_gradient_error(
                    grad_y, x_shape, grad_x, x_shape)
                err_grad_grad_y_2 = gradient_checker.compute_gradient_error(
                    grad_y, x_shape, grad_scale, scale_shape)
                err_grad_grad_y_3 = gradient_checker.compute_gradient_error(
                    grad_y, x_shape, grad_offset, scale_shape)
                # In freeze mode, grad_x is not a function of x.
                if is_training:
                    err_grad_x_1 = gradient_checker.compute_gradient_error(
                        x, x_shape, grad_x, x_shape)
                err_grad_x_2 = gradient_checker.compute_gradient_error(
                    x, x_shape, grad_scale, scale_shape)

                err_grad_scale = gradient_checker.compute_gradient_error(
                    scale, scale_shape, grad_x, x_shape)
            else:
                x32 = constant_op.constant(x_val,
                                           dtype=dtypes.float32,
                                           name='x32')
                grad_y32 = constant_op.constant(grad_y_val,
                                                dtype=dtypes.float32,
                                                name='grad_y32')
                y32, _, _ = nn_impl.fused_batch_norm(
                    x32,
                    scale,
                    offset,
                    mean=pop_mean,
                    variance=pop_var,
                    exponential_avg_factor=exponential_avg_factor,
                    data_format=data_format,
                    is_training=is_training)
                grad_x32, grad_scale32, grad_offset32 = gradients_impl.gradients(
                    y32, [x32, scale, offset], grad_y32)
                err_grad_grad_y_1 = self._compute_gradient_error_float16(
                    grad_y, grad_y32, x_shape, grad_x, grad_x32, x_shape)
                err_grad_grad_y_2 = self._compute_gradient_error_float16(
                    grad_y, grad_y32, x_shape, grad_scale, grad_scale32,
                    scale_shape)
                err_grad_grad_y_3 = self._compute_gradient_error_float16(
                    grad_y, grad_y32, x_shape, grad_offset, grad_offset32,
                    scale_shape)
                # In freeze mode, grad_x is not a function of x.
                if is_training:
                    err_grad_x_1 = self._compute_gradient_error_float16(
                        x, x32, x_shape, grad_x, grad_x32, x_shape)
                err_grad_x_2 = self._compute_gradient_error_float16(
                    x, x32, x_shape, grad_scale, grad_scale32, scale_shape)

                err_grad_scale = self._compute_gradient_error_float16(
                    scale, scale, scale_shape, grad_x, grad_x32, x_shape)

        self.assertLess(err_grad_grad_y_1, err_tolerance)
        self.assertLess(err_grad_grad_y_2, err_tolerance)
        self.assertLess(err_grad_grad_y_3, err_tolerance)
        if is_training:
            self.assertLess(err_grad_x_1, err_tolerance)
        self.assertLess(err_grad_x_2, err_tolerance)
        self.assertLess(err_grad_scale, err_tolerance)
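
A hedged sketch of the grad-of-grad check this test performs, on a toy function and with the public tf.compat.v1.test.compute_gradient_error instead of the internal gradient_checker module; the tolerance comment is only indicative.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
y = tf.tanh(x)
dy_dx = tf.gradients(y, [x])[0]

with tf.Session():
  # Compares the symbolic Jacobian of dy_dx w.r.t. x against a numeric one,
  # i.e. checks the second derivative of tanh, at random x values.
  err = tf.test.compute_gradient_error(x, [2, 2], dy_dx, [2, 2])
  print(err)  # small, comparable to the 1e-3 tolerance used above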
Beispiel #54
0
    def _train_op_fn(loss):
      """Run one training iteration."""
      if training_state_cache:
        # Cache logits only after center_bias is complete, if it's in progress.
        train_op.append(
            control_flow_ops.cond(
                center_bias_var, control_flow_ops.no_op,
                lambda: training_state_cache.insert(tree_ids, node_ids, logits))
        )

      if closed_form_grad_and_hess_fn:
        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
      else:
        gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
        hessians = gradients_impl.gradients(
            gradients, logits, name='Hessians')[0]

      stats_summaries_list = []
      for i, feature_ids in enumerate(feature_ids_list):
        num_buckets = bucket_size_list[i]
        summaries = [
            array_ops.squeeze(
                boosted_trees_ops.make_stats_summary(
                    node_ids=node_ids,
                    gradients=gradients,
                    hessians=hessians,
                    bucketized_features_list=[input_feature_list[f]],
                    max_splits=max_splits,
                    num_buckets=num_buckets),
                axis=0) for f in feature_ids
        ]
        stats_summaries_list.append(summaries)

      # ========= Helper methods for both in and not in memory. ==============
      def grow_tree_from_stats_summaries(stats_summaries_list,
                                         feature_ids_list):
        """Updates ensemble based on the best gains from stats summaries."""
        node_ids_per_feature = []
        gains_list = []
        thresholds_list = []
        left_node_contribs_list = []
        right_node_contribs_list = []
        all_feature_ids = []

        assert len(stats_summaries_list) == len(feature_ids_list)

        for i, feature_ids in enumerate(feature_ids_list):
          (numeric_node_ids_per_feature, numeric_gains_list,
           numeric_thresholds_list, numeric_left_node_contribs_list,
           numeric_right_node_contribs_list) = (
               boosted_trees_ops.calculate_best_gains_per_feature(
                   node_id_range=last_layer_nodes_range,
                   stats_summary_list=stats_summaries_list[i],
                   l1=tree_hparams.l1,
                   l2=tree_hparams.l2,
                   tree_complexity=tree_hparams.tree_complexity,
                   min_node_weight=tree_hparams.min_node_weight,
                   max_splits=max_splits))

          all_feature_ids += feature_ids
          node_ids_per_feature += numeric_node_ids_per_feature
          gains_list += numeric_gains_list
          thresholds_list += numeric_thresholds_list
          left_node_contribs_list += numeric_left_node_contribs_list
          right_node_contribs_list += numeric_right_node_contribs_list

        grow_op = boosted_trees_ops.update_ensemble(
            # Confirm if local_tree_ensemble or tree_ensemble should be used.
            tree_ensemble.resource_handle,
            feature_ids=all_feature_ids,
            node_ids=node_ids_per_feature,
            gains=gains_list,
            thresholds=thresholds_list,
            left_node_contribs=left_node_contribs_list,
            right_node_contribs=right_node_contribs_list,
            learning_rate=tree_hparams.learning_rate,
            max_depth=tree_hparams.max_depth,
            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
        return grow_op

      def _center_bias_fn(mean_gradients, mean_hessians):
        """Updates the ensembles and cache (if needed) with logits prior."""
        continue_centering = boosted_trees_ops.center_bias(
            tree_ensemble.resource_handle,
            mean_gradients=mean_gradients,
            mean_hessians=mean_hessians,
            l1=tree_hparams.l1,
            l2=tree_hparams.l2
        )
        return center_bias_var.assign(continue_centering)

      # ========= End of helper methods. ==============

      if train_in_memory and is_single_machine:
        train_op.append(distribute_lib.increment_var(global_step))

        mean_gradients = array_ops.expand_dims(
            math_ops.reduce_mean(gradients, 0), 0)
        mean_hessians = array_ops.expand_dims(
            math_ops.reduce_mean(hessians, 0), 0)

        train_op.append(
            control_flow_ops.cond(
                center_bias_var,
                lambda: _center_bias_fn(mean_gradients, mean_hessians),
                functools.partial(grow_tree_from_stats_summaries,
                                  stats_summaries_list, feature_ids_list)))
      else:

        def center_bias_not_in_mem():
          """Accumulates the data and updates the logits bias, when ready."""
          bias_dependencies = []

          bias_accumulator = data_flow_ops.ConditionalAccumulator(
              dtype=dtypes.float32,
              # The stats consist of the gradient and hessian means only.
              # TODO(nponomareva): this will change for multiclass.
              shape=[2, 1],
              shared_name='bias_accumulator')

          grads_and_hess = array_ops.stack([gradients, hessians], axis=0)
          grads_and_hess = math_ops.reduce_mean(grads_and_hess, axis=1)

          apply_grad = bias_accumulator.apply_grad(grads_and_hess, stamp_token)
          bias_dependencies.append(apply_grad)

          def center_bias_from_accumulator():
            accumulated = array_ops.unstack(
                bias_accumulator.take_grad(1), axis=0)
            return _center_bias_fn(
                array_ops.expand_dims(accumulated[0], 0),
                array_ops.expand_dims(accumulated[1], 0))

          with ops.control_dependencies(bias_dependencies):
            if config.is_chief:
              center_bias_op = control_flow_ops.cond(
                  math_ops.greater_equal(bias_accumulator.num_accumulated(),
                                         n_batches_per_layer),
                  center_bias_from_accumulator,
                  control_flow_ops.no_op,
                  name='wait_until_n_batches_for_bias_accumulated')

              return center_bias_op
            else:
              # Non-chief workers only contribute to the shared accumulator.
              return control_flow_ops.no_op()

        def grow_not_in_mem():
          """Accumulates the data and grows a layer when ready."""

          accumulators = []
          dependencies = []
          for i, feature_ids in enumerate(feature_ids_list):
            stats_summaries = stats_summaries_list[i]
            accumulator = data_flow_ops.ConditionalAccumulator(
                dtype=dtypes.float32,
                # The stats consist of grads and hessians (the last dimension).
                shape=[len(feature_ids), max_splits, bucket_size_list[i], 2],
                shared_name='numeric_stats_summary_accumulator_' + str(i))
            accumulators.append(accumulator)

            apply_grad = accumulator.apply_grad(
                array_ops.stack(stats_summaries, axis=0), stamp_token)
            dependencies.append(apply_grad)

          def grow_tree_from_accumulated_summaries_fn():
            """Updates tree with the best layer from accumulated summaries."""
            # Take out the accumulated summaries from the accumulator and grow.
            stats_summaries_list = [
                array_ops.unstack(accumulator.take_grad(1), axis=0)
                for accumulator in accumulators
            ]

            grow_op = grow_tree_from_stats_summaries(stats_summaries_list,
                                                     feature_ids_list)
            return grow_op

          with ops.control_dependencies(dependencies):
            if config.is_chief:
              min_accumulated = math_ops.reduce_min(
                  array_ops.stack(
                      [acc.num_accumulated() for acc in accumulators]))

              grow_model = control_flow_ops.cond(
                  math_ops.greater_equal(min_accumulated, n_batches_per_layer),
                  grow_tree_from_accumulated_summaries_fn,
                  control_flow_ops.no_op,
                  name='wait_until_n_batches_accumulated')

              return grow_model
            else:
              # Non-chief workers only contribute to the shared accumulators.
              return control_flow_ops.no_op()

        update_model = control_flow_ops.cond(
            center_bias_var, center_bias_not_in_mem, grow_not_in_mem)
        train_op.append(update_model)
        with ops.control_dependencies([update_model]):
          increment_global = distribute_lib.increment_var(global_step)
          train_op.append(increment_global)

      return control_flow_ops.group(train_op, name='train_op')
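
The example above relies on data_flow_ops.ConditionalAccumulator to collect per-batch statistics until n_batches_per_layer batches have been applied, and only then grows a layer or centers the bias. Below is a minimal, self-contained sketch of that accumulate-then-update pattern, written against the public tf.compat.v1 API rather than the internal modules used above; the threshold, shapes, and names are illustrative only.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

n_batches_per_layer = 3  # illustrative threshold
stats = tf.placeholder(tf.float32, shape=[2, 1])  # e.g. mean grads and hessians

# Shared accumulator; take_grad() returns the mean of everything applied
# since the last take.
accumulator = tf.ConditionalAccumulator(
    dtype=tf.float32, shape=[2, 1], shared_name='stats_accumulator')
apply_op = accumulator.apply_grad(stats, local_step=0)

def _update_from_accumulator():
  # Drain the accumulator and use the averaged statistics.
  return tf.identity(accumulator.take_grad(1), name='accumulated_stats')

def _wait():
  # Not enough batches yet; return a value matching the true branch's shape.
  return tf.zeros([2, 1])

with tf.control_dependencies([apply_op]):
  maybe_update = tf.cond(
      tf.greater_equal(accumulator.num_accumulated(), n_batches_per_layer),
      _update_from_accumulator, _wait,
      name='wait_until_n_batches_accumulated')

with tf.Session() as sess:
  for step in range(n_batches_per_layer):
    result = sess.run(maybe_update, feed_dict={stats: [[float(step)], [1.0]]})
  print(result)  # Mean of the applied stats once the threshold is reached.
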
Example #55
    def testTimeReversedFusedRNN(self):
        with self.cached_session() as sess:
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890213)
            fw_cell = rnn_cell.BasicRNNCell(10)
            bw_cell = rnn_cell.BasicRNNCell(10)
            batch_size = 5
            input_size = 20
            timelen = 15
            inputs = constant_op.constant(
                np.random.randn(timelen, batch_size, input_size))

            # test bi-directional rnn
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                unpacked_inputs = array_ops.unstack(inputs)
                outputs, fw_state, bw_state = rnn.static_bidirectional_rnn(
                    fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
                packed_outputs = array_ops.stack(outputs)
                basic_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("basic/")
                ]
                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_fw_state, basic_bw_state = sess.run(
                    [packed_outputs, fw_state, bw_state])
                basic_grads = sess.run(
                    gradients_impl.gradients(packed_outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(packed_outputs, basic_vars))

            with variable_scope.variable_scope("fused",
                                               initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    rnn_cell.BasicRNNCell(10))
                fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
                    fused_rnn_cell.FusedRNNCellAdaptor(
                        rnn_cell.BasicRNNCell(10)))
                fw_outputs, fw_state = fused_cell(inputs,
                                                  dtype=dtypes.float64,
                                                  scope="fw")
                bw_outputs, bw_state = fused_bw_cell(inputs,
                                                     dtype=dtypes.float64,
                                                     scope="bw")
                outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
                fused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused/")
                ]
                sess.run([variables.global_variables_initializer()])
                fused_outputs, fused_fw_state, fused_bw_state = sess.run(
                    [outputs, fw_state, bw_state])
                fused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_vars))

            self.assertAllClose(basic_outputs, fused_outputs)
            self.assertAllClose(basic_fw_state, fused_fw_state)
            self.assertAllClose(basic_bw_state, fused_bw_state)
            self.assertAllClose(basic_grads, fused_grads)
            for basic, fused in zip(basic_wgrads, fused_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
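
The test above uses a standard comparison pattern: build two implementations that should be mathematically equivalent, differentiate both with gradients_impl.gradients, and assert that the resulting gradient values match. A compact, self-contained sketch of that pattern, with two equivalent expressions standing in for the basic and fused RNNs (names below are illustrative):

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

inputs = tf.constant(np.random.randn(4, 3), dtype=tf.float32)

# Two formulations of the same computation.
out_reference = tf.reduce_sum(tf.square(inputs))
out_optimized = tf.reduce_sum(inputs * inputs)

grad_reference = tf.gradients(out_reference, inputs)[0]
grad_optimized = tf.gradients(out_optimized, inputs)[0]

with tf.Session() as sess:
  g_ref, g_opt = sess.run([grad_reference, grad_optimized])
  # Both gradients should equal 2 * inputs.
  np.testing.assert_allclose(g_ref, g_opt, rtol=1e-6, atol=1e-6)
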
Example #56
 def testNoIntegerGradient1(self):
     x = constant_op.constant([3.9, 4.1])
     k = math_ops.to_float(math_ops.to_int32(x))
     y = k * k
     dy_dx, = gradients_impl.gradients(y, x)
     self.assertIsNone(dy_dx)
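
As this test shows, gradients_impl.gradients returns None when the only path from y to x passes through a non-differentiable integer cast. A common defensive pattern is to substitute zeros for the missing gradients before using them; a minimal sketch using the public tf.compat.v1 API:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.constant([3.9, 4.1])
k = tf.cast(tf.cast(x, tf.int32), tf.float32)  # non-differentiable round trip
y = k * k

grads = tf.gradients(y, [x])
# Replace None gradients with zeros so downstream code always sees tensors.
grads = [g if g is not None else tf.zeros_like(v) for g, v in zip(grads, [x])]

with tf.Session() as sess:
  print(sess.run(grads))  # [array([0., 0.], dtype=float32)]
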
Example #57
 def inner_nesting_fn():
     return gradients_impl.gradients(cond_outer, [x, y])
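
This fragment depends on cond_outer, x, and y, which are defined elsewhere in the original test (a nested tf.cond construct). A hypothetical, self-contained reconstruction of the surrounding setup, purely for illustration:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.constant(1.0)
y = tf.constant(6.0)
pred_outer = tf.placeholder(tf.bool)
pred_inner = tf.placeholder(tf.bool)

# Outer cond whose true branch itself contains a cond.
cond_outer = tf.cond(
    pred_outer,
    lambda: x * tf.cond(pred_inner, lambda: 2.0 * y, lambda: 3.0 * y),
    lambda: x + y)

def inner_nesting_fn():
  return tf.gradients(cond_outer, [x, y])

with tf.Session() as sess:
  # Gradients flow only through the branches actually taken: [18.0, 3.0].
  print(sess.run(inner_nesting_fn(),
                 feed_dict={pred_outer: True, pred_inner: False}))
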
Example #58
 def testNoIntegerGradient2(self):
     k = constant_op.constant([3, 4])
     x = math_ops.to_float(k)
     y = x * x
     dy_dk, = gradients_impl.gradients(y, k)
     self.assertIsNone(dy_dk)
Example #59
 def step(c):
     x = constant_op.constant(42.)
     y = comm_fn(x) * c
     return gradients_impl.gradients(y, [x])[0]
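
comm_fn here is defined elsewhere in the original test (it stands for some differentiable communication op applied to x). A self-contained stand-in with a plain differentiable function substituted for comm_fn, just to make the gradient flow visible:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

def comm_fn(x):
  # Placeholder for the original op; any differentiable function works here.
  return 2.0 * x

def step(c):
  x = tf.constant(42.)
  y = comm_fn(x) * c
  return tf.gradients(y, [x])[0]

with tf.Session() as sess:
  # d(2 * x * c) / dx = 2 * c
  print(sess.run(step(tf.constant(3.0))))  # 6.0
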
Example #60
 def testIntegerIdentityGradient(self):
     x = constant_op.constant(3)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
         self.assertAllClose(1, sess.run(dx_dx))