def testNanFromGradsDontPropagate(self):
  """Test that update with NaN gradients does not cause NaN in results.

  Runs one HMC step on a target whose log-prob is `np.nan * sum(x)` and checks
  that the state is unchanged, the acceptance probability is zero, the
  proposed gradients are disconnected from the initial state, and the gradient
  of the updated state with respect to the initial state is finite.
  """
  if tf1.control_flow_v2_enabled():
    self.skipTest('b/138796859')
  if tf.executing_eagerly():
    # The gradient checks below are expressed with `tf.gradients`.
    return

  def _nan_log_prob_with_nan_gradient(x):
    return np.nan * tf.reduce_sum(x)

  start_state = tf.linspace(0.01, 5, 10)
  kernel = tfp.mcmc.HamiltonianMonteCarlo(
      target_log_prob_fn=_nan_log_prob_with_nan_gradient,
      step_size=2.,
      num_leapfrog_steps=5)
  bootstrap = kernel.bootstrap_results(start_state)
  next_state, results = kernel.one_step(
      current_state=start_state,
      previous_kernel_results=bootstrap,
      seed=test_util.test_seed())

  fetches = [start_state, next_state, results.log_accept_ratio]
  start_state_, next_state_, log_accept_ratio_ = self.evaluate(fetches)
  # Acceptance probability is min(1, exp(log_accept_ratio)).
  acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

  logging.vlog(1, 'initial_x = {}'.format(start_state_))
  logging.vlog(1, 'updated_x = {}'.format(next_state_))
  logging.vlog(1, 'log_accept_ratio = {}'.format(log_accept_ratio_))

  self.assertAllEqual(start_state_, next_state_)
  self.assertEqual(acceptance_probs, 0.)

  # The proposed gradients must not be connected to the initial state.
  proposed_grads = tf.gradients(
      ys=results.proposed_results.grads_target_log_prob, xs=start_state)
  self.assertAllEqual([True], [g is None for g in proposed_grads])

  state_grad = tf.gradients(ys=next_state, xs=start_state)[0]
  self.assertAllFinite(self.evaluate(state_grad))
def testGradientsSecondOrder(self):
  """Checks that the second derivative of `2 * x**2` evaluates to 4."""

  def double_square(v):
    return 2 * (v**2)

  x = ed.RandomVariable(tfp.distributions.Normal(0.0, 1.0))
  y = double_square(x)
  if not tf.executing_eagerly():
    # Graph mode: differentiate the already-built output twice.
    (first_order,) = tf.gradients(y, x)
    (second_order,) = tf.gradients(first_order, x)
  else:
    # Eager mode: compose two gradient functions.
    first_fn = tfe.gradients_function(double_square)
    second_fn = tfe.gradients_function(lambda v: first_fn(v)[0])
    (second_order,) = second_fn(x)
  self.assertEqual(self.evaluate(second_order), 4.0)
def value_and_gradient(f,
                       xs,
                       output_gradients=None,
                       use_gradient_tape=False,
                       unconnected_gradients=None,
                       name=None):
  """Computes `f(*xs)` and its gradients wrt to `*xs`.

  Args:
    f: Python `callable` to be differentiated. It is called as `f(*xs)`.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient
      implementation (i.e., either the `grad_ys` argument of `tf.gradients` or
      the `output_gradients` argument of `tf.GradientTape.gradient`).
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should
      be used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies
      the gradient value returned when the given input tensors are
      unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'value_and_gradient'`).

  Returns:
    A tuple `(y, dydx)`.
    y: `y = f(*xs)`.
    dydx: Gradient of `y` wrt each of `xs`; a list when `xs` was list-like,
      otherwise the single gradient.
  """
  if not unconnected_gradients:
    unconnected_gradients = tf.UnconnectedGradients.NONE
  xs, is_xs_list_like = _prepare_args(xs)
  with tf.name_scope(name or "value_and_gradient"):
    if not (tf.executing_eagerly() or use_gradient_tape):
      # Graph mode: symbolic differentiation of the built output.
      y = f(*xs)
      dydx = tf.gradients(
          ys=y,
          xs=xs,
          grad_ys=output_gradients,
          unconnected_gradients=unconnected_gradients)
    else:
      with tf.GradientTape() as tape:
        for arg in xs:
          tape.watch(arg)
        y = f(*xs)
      dydx = tape.gradient(
          y,
          xs,
          output_gradients=output_gradients,
          unconnected_gradients=unconnected_gradients)
    # Mirror the structure of the input: list in, list out.
    return (y, dydx) if is_xs_list_like else (y, dydx[0])
def render_deepdream(t_obj,
                     img0=img_noise,
                     iter_n=10,
                     step=1.5,
                     octave_n=4,
                     octave_scale=1.4):
  """Runs multi-octave gradient ascent on an image and displays the result.

  Args:
    t_obj: Tensor whose mean is the objective to maximize.
    img0: Starting image. Its first two dimensions are read as (height, width).
    iter_n: Number of gradient-ascent iterations per octave.
    step: Step size applied to the normalized gradient.
    octave_n: Number of octaves (scales) to process.
    octave_scale: Factor by which height/width shrink between octaves.
  """
  # Optimization objective and its gradient with respect to the input image.
  t_score = tf.reduce_mean(t_obj)
  t_grad = tf.gradients(t_score, t_input)[0]

  # Split the image into a number of octaves: each pass keeps the detail lost
  # by downsampling (`hi`) and continues with the downsampled image (`lo`).
  img = img0
  octaves = []
  for _ in range(octave_n - 1):
    hw = img.shape[:2]
    lo = resize(img, np.int32(np.float32(hw) / octave_scale))
    # Fixed: the original read `resize(low, hw)`, but `low` is never defined.
    hi = img - resize(lo, hw)
    img = lo
    octaves.append(hi)

  # Generate details octave by octave, from the coarsest scale upwards.
  for octave in range(octave_n):
    if octave > 0:
      # Upsample and add back the detail stored for this scale.
      hi = octaves[-octave]
      img = resize(img, hi.shape[:2]) + hi
    for _ in range(iter_n):
      g = calc_grad_tiled(img, t_grad)
      # Normalize by the mean absolute gradient; 1e-7 avoids division by zero.
      # NOTE(review): when `octave_n == 1`, `img` is still `img0`, so this
      # in-place update presumably mutates the caller's array - confirm.
      img += g * (step / (np.abs(g).mean() + 1e-7))
    # Output the deep-dreamed image for this octave.
    showarray(img / 255.0)
def test_valid_gradients(self):
  """Tests none of the gradients is nan."""
  # The first two entries of each row of `x` are less than or equal to the
  # first knot in `x_data`, and the last two are greater than or equal to the
  # last knot. This exercises every `tf.where` branch of the implementation:
  # a nan in an unselected branch could still propagate through the gradient
  # calculation and make the final result nan.
  x = [[-10.0, -1.0, 1.0, 3.0, 6.0, 7.0], [8.0, 15.0, 18.0, 25.0, 30.0, 35.0]]
  x_data = [[-1.0, 2.0, 6.0], [8.0, 18.0, 30.0]]

  def _value_helper_fn(y_data):
    """Returns the sum of squared interpolated values."""
    interpolated = tff.math.interpolation.linear.interpolate(
        x, x_data, y_data, dtype=tf.float64)
    return tf.reduce_sum(tf.math.square(interpolated))

  y_data = tf.convert_to_tensor(
      [[10.0, -1.0, -5.0], [7.0, 9.0, 20.0]], dtype=tf.float64)

  if not tf.executing_eagerly():
    total = _value_helper_fn(y_data=y_data)
    grads = tf.gradients(total, y_data)[0]
  else:
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(y_data)
      total = _value_helper_fn(y_data=y_data)
    grads = tape.gradient(total, y_data)

  grads = tf.convert_to_tensor(grads)
  has_nan = tf.reduce_any(tf.math.is_nan(grads))
  self.assertFalse(self.evaluate(has_nan))
def testHandlesNanFromKinetic(self):
  """Checks the log-acceptance correction for inf/nan momentums.

  Builds every (momentum, proposed momentum) pair from `[1, inf, -inf, nan]`,
  one pair per chain, and verifies both the correction value and its gradient
  with respect to the momentums.
  """
  if tf.executing_eagerly():
    return
  x = self.dtype([1, np.inf, -np.inf, np.nan])
  # One column vector per meshgrid output: all pairwise combinations of `x`.
  momentums, proposed_momentums = [
      [np.reshape(grid, [-1, 1])] for grid in np.meshgrid(x, x)
  ]
  num_chains = len(momentums[0])

  momentums = [tf.convert_to_tensor(momentums[0])]
  proposed_momentums = [tf.convert_to_tensor(proposed_momentums[0])]

  log_acceptance_correction = _compute_log_acceptance_correction(
      momentums, proposed_momentums, independent_chain_ndims=1)
  grads = tf.gradients(ys=log_acceptance_correction, xs=momentums)

  [actual_log_acceptance_correction, grads_] = self.evaluate(
      [log_acceptance_correction, grads])

  # The expected correction is 0 for the first chain and `-inf` for every
  # other chain (note the leading negation below).
  expected_log_acceptance_correction = -(
      self.dtype([0] + [np.inf] * (num_chains - 1)))
  self.assertAllEqual(expected_log_acceptance_correction,
                      actual_log_acceptance_correction)

  # Ensure gradient is finite.
  # Builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in
  # 1.20 and removed in 1.24.
  g = grads_[0].reshape([len(x), len(x)])[:, 0]
  self.assertAllEqual(np.ones_like(g, dtype=bool), np.isfinite(g))

  # The remaining gradients are nan because the momentum was itself nan or
  # inf.
  g = grads_[0].reshape([len(x), len(x)])[:, 1:]
  self.assertAllEqual(np.ones_like(g, dtype=bool), np.isnan(g))
def testGradientsSecondOrder(self):
  """Checks that the second derivative of `2 * x**2` evaluates to 4."""
  x = ed.RandomVariable(tfp.distributions.Normal(0.0, 1.0))

  def f(v):
    return 2 * (v ** 2)

  if not tf.executing_eagerly():
    y = f(x)
    (first_order,) = tf.gradients(y, x)
    (second_order,) = tf.gradients(first_order, x)
  else:
    # The inner gradient is taken inside the outer tape's context so that the
    # outer tape records it and can differentiate it again.
    with tf.GradientTape() as outer_tape:
      outer_tape.watch(x.value)
      with tf.GradientTape() as inner_tape:
        inner_tape.watch(x.value)
        y = f(x)
      first_order = inner_tape.gradient(y, [x.value])[0]
    second_order = outer_tape.gradient(first_order, [x.value])[0]
  self.assertEqual(self.evaluate(second_order), 4.0)
def compute_gradients(self, loss, tape=None):
  """Returns the gradients of `loss` with respect to `self.variables`.

  Args:
    loss: Value to differentiate.
    tape: The `GradientTape` to differentiate with. Must not be `None` when
      executing eagerly; unused otherwise.

  Returns:
    The result of `tape.gradient` in eager mode, or of `tf.gradients`
    otherwise.
  """
  if not tf.executing_eagerly():
    return tf.gradients(loss, self.variables)
  # Eager mode needs a tape to differentiate with.
  assert tape is not None
  return tape.gradient(loss, self.variables)
def testGradientsFirstOrder(self):
  """Checks that the derivative of `2 * x` evaluates to 2."""

  def double(v):
    return 2. * v

  x = ed.RandomVariable(tfp.distributions.Normal(0., 1.))
  y = double(x)
  if not tf.executing_eagerly():
    (derivative,) = tf.gradients(y, x)
  else:
    derivative_fn = tfe.gradients_function(double)
    (derivative,) = derivative_fn(x)
  self.assertEqual(self.evaluate(derivative), 2.)
def get_exec_time_timeline(model,
                           batch_size,
                           get_grads=False,
                           num_runs=1,
                           return_timeline=False):
  """Times a model's concrete function from a full-trace session timeline.

  The function is run `num_runs + 1` times, each in a fresh `tf1.Session`,
  on random-normal inputs.

  Args:
    model: Model passed to `get_shapes` and `get_concrete_function`.
    batch_size: Batch size passed to `get_shapes`.
    get_grads: If true, time the gradients of the output with respect to the
      inputs (using random-normal output gradients) instead of the output.
    num_runs: Number of runs in addition to the first one.
    return_timeline: If true, return the Chrome trace of the first run
      instead of timing statistics.

  Returns:
    The Chrome trace if `return_timeline` is set; otherwise a tuple
    `(average, std)` of the measured times. The first run is dropped from the
    statistics when doing so lowers the standard deviation.
  """
  print("get_exec_time_timeline", model.__class__.__name__)
  run_opts = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)
  input_shapes, output_shapes = get_shapes(model, batch_size)
  concrete_function = get_concrete_function(model, input_shapes)

  times = []
  for _ in range(num_runs + 1):
    with tf1.Session() as sess:
      run_meta = tf1.RunMetadata()
      sess.run(tf1.global_variables_initializer())
      inputs = [tf.random.normal(shp) for shp in input_shapes]
      outputs = [tf.random.normal(shp) for shp in output_shapes]
      out = concrete_function(*inputs)
      # Trace either the forward pass or the backward pass.
      if get_grads:
        fetches = tf.gradients(out, inputs, grad_ys=outputs)
      else:
        fetches = out
      sess.run(fetches, options=run_opts, run_metadata=run_meta)
      ctf = timeline.Timeline(run_meta.step_stats).generate_chrome_trace_format()
      if return_timeline:
        return ctf
      times.append(convert_string_to_time(ctf))

  warm_times = times[1:]
  if np.std(times) > np.std(warm_times):
    # Filter first run
    return np.average(warm_times), np.std(warm_times)
  return np.average(times), np.std(times)
def gradients(func_or_y,
              xs,
              output_gradients=None,
              use_gradient_tape=False,
              unconnected_gradients=None,
              name=None):
  """Computes the gradients of `func_or_y` wrt to `*xs`.

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python
      callable accepting one `Tensor` of shape of `x` and returning a `Tensor`
      of any shape. The function whose gradient is to be computed. If eagerly
      executing, can only be a callable, i.e., one should not supply a Tensor
      in eager mode.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient
      implementation (i.e., either the `grad_ys` argument of `tf.gradients` or
      the `output_gradients` argument of `tf.GradientTape.gradient`).
      Default value: `None` which maps to a ones-like `Tensor` of `ys`.
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should
      be used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies
      the gradient value returned when the given input tensors are
      unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs` or a list of
    `Tensor`s if `xs` is a list.

  Raises:
    ValueError: If `func_or_y` is not callable while executing eagerly or
      when `use_gradient_tape` is set.
  """
  if not unconnected_gradients:
    unconnected_gradients = tf.UnconnectedGradients.NONE
  f = _prepare_func(func_or_y)
  with tf.name_scope(name or "gradients"):
    xs, is_xs_list_like = _prepare_args(xs)
    if tf.executing_eagerly() or use_gradient_tape:
      # A tape can only record a computation that runs inside its context.
      if not callable(func_or_y):
        raise ValueError("`func_or_y` should be a callable in eager mode or "
                         "when `tf.GradientTape` is used.")
      with tf.GradientTape() as tape:
        for arg in xs:
          tape.watch(arg)
        y = f(*xs)
      result = tape.gradient(
          y,
          xs,
          output_gradients=output_gradients,
          unconnected_gradients=unconnected_gradients)
    else:
      y = f(*xs)
      result = tf.gradients(
          y,
          xs,
          grad_ys=output_gradients,
          unconnected_gradients=unconnected_gradients)
    return result if is_xs_list_like else result[0]
def test_interpolation_differentiable(self):
  """Checks gradients of interpolated values with respect to one knot."""
  dtype = tf.float64
  interval_times = tf.constant([0.25, 0.5, 1.0, 2.0, 3.0], dtype=dtype)
  # The knot is kept as its own tensor so that it can be differentiated
  # against.
  knot_1y = tf.constant([0.052], dtype=dtype)
  leading_values = tf.constant([0.05, 0.051], dtype=dtype)
  trailing_values = tf.constant([0.053, 0.055], dtype=dtype)
  interval_values = tf.concat(
      [leading_values, knot_1y, trailing_values], axis=0)
  test_time = tf.constant([1.1, 2.7], dtype=dtype)
  interpolated, _ = monotone_convex.interpolate(test_time, interval_values,
                                                interval_times)

  def _knot_gradient(index):
    grad = tf.gradients(interpolated[index], knot_1y)[0]
    return self.evaluate(tf.convert_to_tensor(grad))

  gradient_1y = _knot_gradient(0)
  gradient_zero = _knot_gradient(1)

  self.assertAlmostEqual(gradient_1y[0], 0.42)
  self.assertAlmostEqual(gradient_zero[0], 0.0)
def grad_fn(temperature):
  """Returns gradient of log-likelihood WRT a logits-scaling temperature."""
  temperature *= tf.ones([])
  rank = len(logits.shape)
  # Rank-1 logits are treated as Bernoulli, rank-2 logits as Categorical.
  if rank == 1:
    dist = tfp.distributions.Bernoulli(logits=logits / temperature)
  elif rank == 2:
    dist = tfp.distributions.Categorical(logits=logits / temperature)
  total_nll = tf.reduce_sum(-dist.log_prob(labels), axis=0)
  (temperature_grad,) = tf.gradients(total_nll, [temperature])
  return temperature_grad
def test_diffs_differentiable(self):
  """Tests that the diffs op is differentiable."""
  x = tf.constant(2.0)
  powers = tf.stack([x, x * x, x * x * x], axis=0)

  # Produces [x, x^2 - x, x^3 - x^2]
  diffs_ = self.evaluate(math.diff(powers))
  np.testing.assert_array_equal(diffs_, [2., 2., 4.])

  # Note that TF gradients adds up the components of the jacobian.
  # The sum of [1, 2x-1, 3x^2-2x] at x = 2 is 12.
  total_grad = tf.gradients(math.diff(powers), x)[0]
  self.assertEqual(self.evaluate(total_grad), 12.0)
def testGradientsFirstOrder(self):
  """Checks that the derivative of `2 * x` evaluates to 2."""
  x = ed.RandomVariable(tfp.distributions.Normal(0., 1.))

  def f(v):
    return 2. * v

  if not tf.executing_eagerly():
    y = f(x)
    (derivative,) = tf.gradients(y, x)
  else:
    with tf.GradientTape() as tape:
      tape.watch(x.value)
      y = f(x)
    derivative = tape.gradient(y, [x.value])[0]
  self.assertEqual(self.evaluate(derivative), 2.)
def value_and_gradient(f,
                       xs,
                       output_gradients=None,
                       use_gradient_tape=False,
                       name=None):
  """Computes `f(*xs)` and its gradients wrt to `*xs`.

  Args:
    f: Python `callable` to be differentiated. It is called as `f(*xs)` after
      each element of `xs` has been converted to a `Tensor`.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient
      implementation (i.e., either the `grad_ys` argument of `tf.gradients` or
      the `output_gradients` argument of `tf.GradientTape.gradient`).
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should
      be used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'value_and_gradient'`).

  Returns:
    y: `y = f(*xs)`.
    dydx: Gradient of `y` wrt each of `xs`.
  """
  with tf.name_scope(name or 'value_and_gradient'):
    is_xs_list_like = isinstance(xs, (tuple, list))
    raw_args = xs if is_xs_list_like else [xs]
    xs = []
    for i, arg in enumerate(raw_args):
      xs.append(
          tf.convert_to_tensor(
              arg, dtype_hint=tf.float32, name='x{}'.format(i)))

    if not (tf.executing_eagerly() or use_gradient_tape):
      y = f(*xs)
      dydx = tf.gradients(ys=y, xs=xs, grad_ys=output_gradients)
    else:
      # Only the explicitly watched inputs are tracked by the tape.
      with tf.GradientTape(watch_accessed_variables=False) as tape:
        for arg in xs:
          tape.watch(arg)
        y = f(*xs)
      dydx = tape.gradient(y, xs, output_gradients=output_gradients)

    # A single (non-list) input yields a single gradient.
    return (y, dydx) if is_xs_list_like else (y, dydx[0])
def loop_body(j):
  """Loop function to compute gradients of the each direction."""
  # Gradient along direction `j`.
  grad_j = tf.gradients(ys=y_[..., j], xs=x)[0]  # pylint: disable=cell-var-from-loop
  if grad_j is not None:
    # Reshape `event_shape` to 1D
    flat_grad = tf.reshape(grad_j, tf.concat([sample_shape, [-1]], -1))
    # Add artificial dimension for the case of zero shape input tensor
    return flat_grad[tf.newaxis, ..., j]
  # Return zero, if the gradient is `None`.
  return tf.zeros(tf.concat([sample_shape, [1]], -1), dtype=x.dtype)  # pylint: disable=cell-var-from-loop
def test_gradients_and_propagation_of_nan_in_x(self):
  """Checks NaN handling of `batch_interp_regular_1d_grid` and its gradient."""
  # If x contains NaN, this should propagate through to y, and not mess up the
  # gradients associated with finite members of x.
  # In fact, even NaN members of x result in finite (zero) gradients.
  dtype = np.float32
  x_min, x_max = 0., 1.
  num_pts = 4

  # Reference values follow y = 2 * x on the regular grid.
  implied_x_ref = np.linspace(x_min, x_max, num_pts, dtype=dtype)
  y_ref = 2 * implied_x_ref

  x_ = np.array([0., 0.1, np.nan, 0.4, 1.]).astype(dtype)
  y_expected = 2 * x_

  x = tf.constant(x_)
  y = tfp.math.batch_interp_regular_1d_grid(x, x_min, x_max, y_ref)
  self.assertAllClose(self.evaluate(y), y_expected, atol=0, rtol=1e-6)

  if tf.executing_eagerly():
    return
  grad = tf.gradients(ys=y, xs=x)[0]
  dy_dx_ = self.evaluate(grad)
  self.assertAllClose([2., 2., 0., 2., 2.], dy_dx_)
def fwd_gradient(func, x, grad_x=None, use_gradient_tape=False):
  """Computes forward mode gradient.

  Implementation based on suggestions in
  [this thread](https://github.com/tensorflow/tensorflow/issues/19361).

  TensorFlow computes gradients using the reverse mode automatic
  differentiation which is suitable for typical machine learning situations
  where one has a scalar loss function that one wants to differentiate with
  respect to the parameters. In some cases, one needs to be able to compute
  directional derivatives of non-scalar functions. Suppose F is a function from
  R^n to R^m and let u be a fixed vector in R^n, w a fixed vector in R^m and
  x a variable taking values in R^n. Let J(F) denote the jacobian matrix of
  F of shape [m, n] (i.e. J(F)[i, j] = dF_i / dx_j). Then the default
  gradients function in TF computes the expression
  w^T.J(F) (i.e. Sum[w_i dF_i / dx_j, 1 <= i <= m]).

  On the other hand, one also often needs to compute the directional derivative
  J(F).u (i.e. Sum[u_j dF_i / dx_j, 1 <= j <= n]). Unfortunately, TensorFlow
  has no native support for accumulating this. Providing first class support
  for forward mode differentiation requires some significant changes in the core
  architecture of TF (including writing a directional derivative for each
  op).

  The following function sidesteps this by using two passes of reverse mode
  differentiation. Mathematically, the idea is simple. If F: R^n -> R^m, then
  w^T.J(F) seen as a function of w is a function from R^m to R^n (because
  w is in R^m, and w^T.J(F) is in R^n). Hence a reverse mode differentiation
  with respect to w should produce J(F).u.

  This function provides only a small subset of the flexibility of
  the tf.gradients function. This may be extended in the future.

  ### Example

  Following example demonstrates the usage and the difference between this
  op and the standard `tf.gradients`
  ```python
    t = tf.range(1, 3, dtype=tf.float32)  # Shape [2]
    def fn(t):
      return tf.stack([t, t ** 2, t ** 3], axis=0)  # Shape [3, 2]
    # Produces shape [3, 2] with values [[1, 1], [2, 4], [3, 12]]
    fwd_grad_y = fwd_gradient(fn, t)
    # Produces shape [2] with values [6, 17].
    bck_grad_y = tf.gradients(fn(t), t)[0]
  ```

  Args:
    func: A Python callable accepting one `Tensor` of shape of `x` and returning
      a `Tensor` of any shape. The function whose gradient is to be computed.
    x: A `Tensor` with respect to which the gradient is to be computed.
    grad_x: A `Tensor` of the same shape as `x`. The direction along which the
      directional derivative is to be computed.
    use_gradient_tape: Optional Python bool. Whether to use gradient tape even
      when eager mode is not turned on.

  Returns:
    A `Tensor` of the same shape as `func(x)`.
  """
  if tf.executing_eagerly() or use_gradient_tape:
    with tf.GradientTape() as outer_tape:
      with tf.GradientTape() as inner_tape:
        inner_tape.watch(x)
        y = func(x)
      # `w` is the dummy cotangent that the second pass differentiates against.
      w = tf.zeros_like(y)
      outer_tape.watch(w)
      vjp = inner_tape.gradient(y, x, output_gradients=w)
    return outer_tape.gradient(vjp, w, output_gradients=grad_x)

  # Graph mode: the same two reverse-mode passes expressed with `tf.gradients`.
  y = func(x)
  w = tf.zeros_like(y)
  vjp = tf.gradients(y, x, grad_ys=w)
  return tf.gradients(vjp, w, grad_ys=grad_x)[0]
def _gradient_old(f, xs, grad_ys):
  """Evaluates `f()` and its `tf.gradients` wrt `xs`; not for eager mode.

  Args:
    f: Zero-argument callable producing the value to differentiate.
    xs: Inputs to differentiate with respect to.
    grad_ys: Forwarded as the `grad_ys` argument of `tf.gradients`.

  Returns:
    A tuple of the value `f()` and the result of `tf.gradients`.
  """
  assert not tf.executing_eagerly()
  value = f()
  grads = tf.gradients(value, xs, grad_ys=grad_ys)
  return value, grads
def fwd_gradient(func_or_y,
                 x,
                 input_gradients=None,
                 use_gradient_tape=False,
                 unconnected_gradients=None,
                 name=None):
  """Computes forward mode gradient.

  Implementation based on suggestions in
  [this thread](https://github.com/tensorflow/tensorflow/issues/19361).

  TensorFlow computes gradients using the reverse mode automatic
  differentiation which is suitable for typical machine learning situations
  where one has a scalar loss function that one wants to differentiate with
  respect to the parameters. In some cases, one needs to be able to compute
  directional derivatives of non-scalar functions. Suppose F is a function from
  R^n to R^m and let u be a fixed vector in R^n, w a fixed vector in R^m and
  x a variable taking values in R^n. Let J(F) denote the jacobian matrix of
  F of shape [m, n] (i.e. J(F)[i, j] = dF_i / dx_j). Then the default
  gradients function in TF computes the expression
  w^T.J(F) (i.e. Sum[w_i dF_i / dx_j, 1 <= i <= m]).

  On the other hand, one also often needs to compute the directional derivative
  J(F).u (i.e. Sum[u_j dF_i / dx_j, 1 <= j <= n]). Unfortunately, TensorFlow
  has no native support for accumulating this. Providing first class support
  for forward mode differentiation requires some significant changes in the core
  architecture of TF (including writing a directional derivative for each
  op).

  The following function sidesteps this by using two passes of reverse mode
  differentiation. Mathematically, the idea is simple. If F: R^n -> R^m, then
  w^T.J(F) seen as a function of w is a function from R^m to R^n (because
  w is in R^m, and w^T.J(F) is in R^n). Hence a reverse mode differentiation
  with respect to w should produce J(F).u.

  This function provides only a small subset of the flexibility of
  the tf.gradients function. This may be extended in the future.

  #### Example

  Following example demonstrates the usage and the difference between this
  op and the standard `tf.gradients`
  ```python
    t = tf.range(1, 3, dtype=tf.float32)  # Shape [2]
    def fn(t):
      return tf.stack([t, t ** 2, t ** 3], axis=0)  # Shape [3, 2]
    # Produces shape [3, 2] with values [[1, 1], [2, 4], [3, 12]]
    fwd_grad_y = fwd_gradient(fn, t)
    # Produces shape [2] with values [6, 17].
    bck_grad_y = tf.gradients(fn(t), t)[0]
  ```

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python
      callable accepting one `Tensor` of shape of `x` and returning a `Tensor`
      of any shape. The function whose gradient is to be computed. If eagerly
      executing, can only be a callable, i.e., one should not supply a Tensor
      in eager mode.
    x: A `Tensor` with respect to which the gradient is to be computed.
    input_gradients: A `Tensor` of the same shape as `x`. The direction along
      which the directional derivative is to be computed.
      Default value: `None` which maps to a ones-like `Tensor` of `x`.
    use_gradient_tape: Optional Python bool. Whether to use gradient tape even
      when eager mode is not turned on.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies
      the gradient value returned when the given input tensors are
      unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` of the same shape as `func(x)`.

  Raises:
    ValueError: If `func_or_y` is not a callable and the output is eagerly
      executed or when the `tf.GradientTape` is used.
  """
  if not unconnected_gradients:
    unconnected_gradients = tf.UnconnectedGradients.NONE
  with tf.name_scope(name or "gradients"):
    f = _prepare_func(func_or_y)
    if tf.executing_eagerly() or use_gradient_tape:
      if not callable(func_or_y):
        raise ValueError("`func_or_y` should be a callable in eager mode or when "
                         "`tf.GradientTape` is used.")
      with tf.GradientTape() as outer_tape:
        with tf.GradientTape() as inner_tape:
          inner_tape.watch(x)
          y = f(x)
        # `w` is the dummy cotangent that the second pass differentiates
        # against.
        w = tf.ones_like(y)
        outer_tape.watch(w)
        vjp = inner_tape.gradient(
            y,
            x,
            output_gradients=w,
            unconnected_gradients=unconnected_gradients)
      return outer_tape.gradient(
          vjp,
          w,
          output_gradients=input_gradients,
          unconnected_gradients=unconnected_gradients)

    # Graph mode: the same two reverse-mode passes via `tf.gradients`.
    y = f(x)
    w = tf.ones_like(y)
    vjp = tf.gradients(
        y, x, grad_ys=w, unconnected_gradients=unconnected_gradients)
    second_pass = tf.gradients(
        vjp,
        w,
        grad_ys=input_gradients,
        unconnected_gradients=unconnected_gradients)
    return second_pass[0]
def _grad_and_hessian_loss_fn(x):
  """Returns the loss gradient, the Cholesky factor of its Hessian, and ones.

  The loss is `_neg_log_likelihood(x)`. The third returned value is a
  ones-like tensor shaped like the gradient.
  """
  nll = _neg_log_likelihood(x)
  (grad,) = tf.gradients(ys=nll, xs=[x])
  (hessian,) = tf.hessians(ys=nll, xs=[x])
  chol_factor = tf.linalg.cholesky(hessian)
  return grad, chol_factor, tf.ones_like(grad)
def testPreconditionerComputedCorrectly(self):
  """Test that SGLD step is computed correctly for a 3D Gaussian energy.

  Applies one `StochasticGradientLangevinDynamics` step to two variables
  (three coordinates in total) and compares the result with the update
  computed by hand from the optimizer's decay, bias and learning-rate
  tensors.
  """
  if tf.executing_eagerly():
    return

  with self.cached_session():
    dtype = np.float32
    # Target function is the energy function of normal distribution
    true_mean = dtype([0, 0, 0])
    true_cov = dtype([[1, 0.25, 0.25], [0.25, 1, 0.25], [0.25, 0.25, 1]])
    # Target distribution is defined through the Cholesky decomposition
    chol = tf.linalg.cholesky(true_cov)
    target = tfd.MultivariateNormalTriL(loc=true_mean, scale_tril=chol)
    # The 3D state is split across a 2-element and a 1-element variable.
    var_1 = tf.Variable(name='var_1', initial_value=[1., 1.])
    var_2 = tf.Variable(name='var_2', initial_value=[1.])

    var = [var_1, var_2]

    # Set up the learning rate and the optimizer
    learning_rate = .5
    optimizer_kernel = tfp.optimizer.StochasticGradientLangevinDynamics(
        learning_rate=learning_rate, burnin=1)

    # Target function
    def target_fn(x, y):
      # Stack the input tensors together
      z = tf.concat([x, y], axis=-1) - true_mean
      return -target.log_prob(z)

    grads = tf.gradients(ys=target_fn(*var), xs=var)

    # Update value of `var` with one iteration of the SGLD (without the
    # normal perturbation, since `burnin > 0`)
    step = optimizer_kernel.apply_gradients(zip(grads, var))

    # True theoretical value of `var` after one iteration
    # The optimizer's internal tensors are cast to the variables' dtype.
    decay_tensor = tf.cast(optimizer_kernel._decay_tensor, var[0].dtype)
    diagonal_bias = tf.cast(optimizer_kernel._diagonal_bias, var[0].dtype)
    learning_rate = tf.cast(optimizer_kernel._learning_rate, var[0].dtype)

    velocity = [(decay_tensor * tf.ones_like(v)
                 + (1 - decay_tensor) * tf.square(g))
                for v, g in zip(var, grads)]
    preconditioner = [
        tf.math.rsqrt(vel + diagonal_bias) for vel in velocity
    ]
    # Compute second order gradients
    _, grad_grads = diag_jacobian(xs=var, ys=grads)
    # Compute gradient of the preconditioner (compute the gradient manually)
    preconditioner_grads = [
        -(g * g_g * (1. - decay_tensor) * p**3.)
        for g, g_g, p in zip(grads, grad_grads, preconditioner)
    ]

    # True theoretical value of `var` after one iteration
    var_true = [
        v - learning_rate * 0.5 * (p * g - p_g)
        for v, p, g, p_g in zip(var, preconditioner, grads,
                                preconditioner_grads)
    ]
    self.evaluate(tf1.global_variables_initializer())
    # The expected values are fetched before `step` runs, i.e. while the
    # variables still hold their initial values.
    var_true_ = self.evaluate(var_true)
    self.evaluate(step)
    var_ = self.evaluate(var)  # new `var` after one SGLD step
    self.assertAllClose(var_true_, var_, atol=0.001, rtol=0.001)
# Student network: raw logits, temperature-softened probabilities (used for
# distillation) and plain softmax probabilities.
y_conv = MnistStudent(x, scope = 'student')
y_conv_student = tf2.nn.softmax(y_conv/temperature)
y_conv_student_actual = tf2.nn.softmax(y_conv)

cross_entropy_teacher, accuracy_teacher = loss(y_conv_teacher,y_, temperature = temperature)
student_loss1, accuracy_student = loss(y_conv_student_actual,y_, temperature = temperature)
# Cross-entropy between the teacher's output and the softened student output.
student_loss2 = tf2.reduce_mean(- tf2.reduce_sum(y_conv_teacher * tf2.log(y_conv_student), reduction_indices=1))
# Fixed: the original assigned the undefined name `student_student_loss2`,
# which raises NameError.
# NOTE(review): `student_loss1` is computed but unused - confirm that the
# student objective was not meant to combine both losses.
cross_entropy_student=student_loss2

# Split the trainable variables by the scope name embedded in their names.
model_vars = tf2.trainable_variables()
var_teacher = [var for var in model_vars if 'teacher' in var.name]
var_student = [var for var in model_vars if 'student' in var.name]
grad_teacher = tf2.gradients(cross_entropy_teacher,var_teacher)
grad_student = tf2.gradients(cross_entropy_student,var_student)

# The teacher uses RMSProp with a fed learning rate; the student uses plain
# gradient descent with a fixed rate of 0.1.
l_rate = tf2.placeholder(shape=[],dtype = tf2.float32)
trainer = tf2.train.RMSPropOptimizer(learning_rate = l_rate)
trainer1 = tf2.train.GradientDescentOptimizer(0.1)
train_step_teacher = trainer.apply_gradients(zip(grad_teacher,var_teacher))
train_step_student = trainer1.apply_gradients(zip(grad_student,var_student))

sess = tf2.InteractiveSession()
sess.run(tf2.global_variables_initializer())
# Separate savers so each network's variables are handled independently.
saver1 = tf2.train.Saver(var_teacher)
saver2 = tf2.train.Saver(var_student)