Example #1
 def testGradientFunction(self):
   # Input to tf.py_func is necessary, otherwise get_gradient_function()
   # returns None by default.
   a = constant_op.constant(0)
   x, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64])
   y, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64], stateful=False)
   self.assertEqual(None, ops.get_gradient_function(x.op))
   self.assertEqual(None, ops.get_gradient_function(y.op))
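For contrast, here is a minimal standalone sketch (TF1 graph mode via `tf.compat.v1`; the op names and values are illustrative) of what `get_gradient_function` returns for an ordinary differentiable op versus a `py_func` output:

import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops

with tf.Graph().as_default():
    a = tf.constant(2.0)
    b = tf.multiply(a, 3.0, name="mul")              # Mul has a registered gradient
    p, = tf.py_func(lambda v: v, [a], [tf.float32])  # py_func has no usable gradient

    print(ops.get_gradient_function(b.op))  # e.g. <function _MulGrad at 0x...>
    print(ops.get_gradient_function(p.op))  # None, as the test above asserts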
Example #2
 def testGradientFunction(self):
     # Input to tf.py_func is necessary, otherwise get_gradient_function()
     # returns None by default.
     a = tf.constant(0)
     x, = tf.py_func(lambda a: 0, [a], [tf.int64])
     y, = tf.py_func(lambda a: 0, [a], [tf.int64], stateful=False)
     self.assertEqual(None, ops.get_gradient_function(x.op))
     self.assertEqual(None, ops.get_gradient_function(y.op))
 def testGradientFunction(self):
   # Input to tf.compat.v1.py_func is necessary,
   # otherwise get_gradient_function() returns None by default.
   a = constant_op.constant(0)
   x, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64])
   y, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64], stateful=False)
   self.assertEqual(None, ops.get_gradient_function(x.op))
   self.assertEqual(None, ops.get_gradient_function(y.op))
def get_grad_results(max_val, pointwise_op, in_size):
    """
    Construct and run a TensorFlow graph to compute the backprop of a pointwise op.

    Will create an input tensor of the required size filled with values -n, -n+1,
    ..., 0, 1, ..., n-1, n and use these to compute the pointwise op.
    Then, create another tensor with the same values to use as the errors
    for back-propagation.
    Returns the computed values in a numpy array.
    """
    with tf.Graph().as_default():
        min_val = -max_val if in_size % 2 == 0 else -max_val - 1
        input_vals = helpers.get_signed_tensor_data(in_size,
                                                    max_val=max_val,
                                                    min_val=min_val)
        inp_tensor = tf.constant(input_vals, dtype=np.float64)

        pointwise_output = pointwise_op(inp_tensor, name='pointwise')

        tf_op = tf.get_default_graph().get_operation_by_name('pointwise')
        grad_fn = get_gradient_function(tf_op)

        output_size = in_size
        error_vals = helpers.get_signed_tensor_data(output_size,
                                                    max_val=max_val,
                                                    min_val=min_val)
        error_tensor = tf.constant(error_vals, dtype=np.float64)

        output = grad_fn(tf_op, error_tensor)

        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            sess.graph.finalize()
            return sess.run(output)
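A hypothetical call to the helper above; `helpers.get_signed_tensor_data` comes from the original test harness and is not shown here, and the choice of `tf.tanh` as the pointwise op is arbitrary:

# Backprop values of tf.tanh over a seven-element signed input.
grad_vals = get_grad_results(max_val=3, pointwise_op=tf.tanh, in_size=7)
print(grad_vals)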
Example #5
def find_non_differentiable(inputs, outputs):
    """
    Searches through a TensorFlow graph to find non-differentiable elements
    between ``inputs`` and ``outputs`` (elements that would prevent us from
    computing ``d_outputs / d_inputs``).

    Parameters
    ----------
    inputs : list of ``tf.Tensor``
        Input tensors
    outputs : list of ``tf.Tensor``
        Output tensors
    """

    for o in outputs:
        if o in inputs:
            continue
        else:
            try:
                grad = get_gradient_function(o.op)

                if grad is None and len(o.op.inputs) > 0:
                    # note: technically we're not sure that this op is
                    # on the path to inputs. we could wait and propagate this
                    # until we find inputs, but that can take a long time for
                    # large graphs. it seems more useful to fail quickly, and
                    # risk some false positives
                    raise LookupError
                find_non_differentiable(inputs, o.op.inputs)
            except LookupError:
                raise SimulationError(
                    "Graph contains non-differentiable "
                    "elements: %s" % o.op)
Example #6
 def testNonExistentOverride(self):
   g = ops.Graph()
   x = an_op(g)
   with g.gradient_override_map({"copy": "unknown_override"}):
     y = copy_op(x)
   with self.assertRaisesRegexp(LookupError, "unknown_override"):
     fn = ops.get_gradient_function(y.op)
Example #7
 def testNonExistentOverride(self):
     g = ops.Graph()
     x = an_op(g)
     with g.gradient_override_map({"copy": "unknown_override"}):
         y = copy_op(x)
     with self.assertRaisesRegexp(LookupError, "unknown_override"):
         fn = ops.get_gradient_function(y.op)
Example #8
 def testOverrideGradients(self):
     g = ops.Graph()
     x = an_op(g)
     with g.gradient_override_map({"copy": "copy_override"}):
         y = copy_op(x)
     fn = ops.get_gradient_function(y.op)
     self.assertEqual(_CopyOverrideGrad, fn)
Example #9
def go():
    dense = tf.Variable([[0, 0, 10, 1, 0, 0], [0, 0, -2, 3, 0, 0]],
                        dtype=tf.float32)
    sm1 = tf.nn.softmax(dense)

    denseReplacing0WithNeg10 = tf.where(
        dense > 0.0, dense,
        tf.ones(tf.shape(dense), tf.float32) * (-10.0))
    sm2 = tf.nn.softmax(denseReplacing0WithNeg10)

    nz_indices = tf.where(tf.not_equal(dense, tf.constant(0,
                                                          dtype=tf.float32)))
    nz_values = tf.gather_nd(dense, nz_indices)
    sparse = tf.SparseTensor(nz_indices, nz_values, dense.get_shape())
    sm3 = tf.sparse_softmax(sparse)
    dm3a = tf.sparse_to_dense(sm3.indices, sm3.get_shape(), sm3.values)
    dm3b = tf.scatter_nd(sm3.indices, sm3.values, dense.get_shape())

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    from tensorflow.python.framework import ops
    for v in nz_indices, nz_values, sparse, sm3, dm3a, dm3b:
        print('gradient of op', v, ops.get_gradient_function(v.op))

    print('dense sm - direct', session.run(sm1))
    print('dense sm - with -10 trick', session.run(sm2))
    print('sparse sm', session.run(sm3))
    print('densified sparse sm - old', session.run(dm3a))
    print('densified sparse sm - new', session.run(dm3b))
Example #10
def find_non_differentiable(inputs, outputs):
    """Searches through a Tensorflow graph to find non-differentiable elements
    between ``inputs`` and ``outputs`` (elements that would prevent us from
    computing ``d_outputs / d_inputs``.

    Parameters
    ----------
    inputs : list of ``tf.Tensor``
        input tensors
    outputs : list of ``tf.Tensor``
        output tensors
    """
    for o in outputs:
        if o in inputs:
            continue
        else:
            try:
                grad = get_gradient_function(o.op)

                if grad is None and len(o.op.inputs) > 0:
                    # note: technically we're not sure that this op is
                    # on the path to inputs. we could wait and propagate this
                    # until we find inputs, but that can take a long time for
                    # large graphs. it seems more useful to fail quickly, and
                    # risk some false positives
                    raise LookupError
                find_non_differentiable(inputs, o.op.inputs)
            except LookupError:
                raise SimulationError("Graph contains non-differentiable "
                                      "elements: %s" % o.op)
Example #11
def pyfunc_test():

    # create data
    x_data = tf.placeholder(dtype=tf.float32, shape=[None])
    y_data = tf.placeholder(dtype=tf.float32, shape=[None])

    w = tf.Variable(tf.constant([0.5]))
    b = tf.Variable(tf.zeros([1]))

    y1 = tf.multiply(w, x_data, name='y1')
    y2 = py_func(addone, [y1], [tf.float32], grad=addone_grad)[0]
    y = tf.add(y2, b)

    loss = tf.reduce_mean(tf.square(y - y_data))
    optimizer = tf.train.GradientDescentOptimizer(0.01)
    train = optimizer.minimize(loss)

    print("Pyfunc grad", ops.get_gradient_function(y2.op))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for step in range(10):
            #            ran = np.random.rand(115).astype(np.float32)
            ran = np.ones((115)).astype(np.float32)
            ans = ran * 1.5 + 3
            dic = {x_data: ran, y_data: ans}
            tt, yy, yy1 = sess.run([train, y1, y2], feed_dict=dic)
            if step % 1 == 0:
                print('step {}'.format(step))
                print('{}, {}'.format(w.eval(), b.eval()))

        test = sess.run(y, feed_dict={x_data: [1]})
        print('test = {}'.format(test))
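The `py_func(addone, [y1], [tf.float32], grad=addone_grad)` helper used above is not shown in this example (`addone` and `addone_grad` are the caller's own forward and gradient functions). A common way to implement such a wrapper, sketched here for TF1 graph mode, is to register the Python gradient under a unique name and point the `PyFunc` op at it with `gradient_override_map`:

import numpy as np
import tensorflow.compat.v1 as tf

def py_func(func, inp, Tout, stateful=True, name=None, grad=None):
    """Sketch of a tf.py_func wrapper that attaches a custom gradient."""
    # Register the gradient under a unique name so repeated calls don't collide.
    rnd_name = "PyFuncGrad" + str(np.random.randint(0, 10**8))
    tf.RegisterGradient(rnd_name)(grad)
    g = tf.get_default_graph()
    # Ops created inside this context look up rnd_name instead of "PyFunc".
    with g.gradient_override_map({"PyFunc": rnd_name}):
        return tf.py_func(func, inp, Tout, stateful=stateful, name=name)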
Example #12
 def testOverrideGradients(self):
   g = ops.Graph()
   x = an_op(g)
   with g.gradient_override_map({"copy": "copy_override"}):
     y = copy_op(x)
   fn = ops.get_gradient_function(y.op)
   self.assertEqual(_CopyOverrideGrad, fn)
 def _Gradient(tensors, devices):
     reduce_tensors, _ = nccl_reduce(tensors, devices)
     tensor_ops = [t.op for t in reduce_tensors]
     d_tensors = _DeviceTensors(tensors, devices)
     grad_tensors = [
         ops.get_gradient_function(op)(op, loss)
         for op, loss in zip(tensor_ops, d_tensors)
     ]
     return grad_tensors, []
Example #14
 def _Gradient(tensors, devices):
   reduce_tensors, _ = nccl_reduce(tensors, devices)
   tensor_ops = [t.op for t in reduce_tensors]
   d_tensors = _DeviceTensors(tensors, devices)
   grad_tensors = [
       ops.get_gradient_function(op)(op, loss)
       for op, loss in zip(tensor_ops, d_tensors)
   ]
   return grad_tensors, []
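Both `_Gradient` helpers above call the registered gradient function directly, as `ops.get_gradient_function(op)(op, loss)`. A standalone sketch of that pattern on a simple op (the choice of `tf.exp` is arbitrary):

import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops

with tf.Graph().as_default():
    x = tf.constant([1.0, 2.0])
    y = tf.exp(x, name="exp")
    exp_op = tf.get_default_graph().get_operation_by_name("exp")

    grad_fn = ops.get_gradient_function(exp_op)  # e.g. <function _ExpGrad at 0x...>
    upstream = tf.ones_like(y)                   # incoming gradient dL/dy
    dx = grad_fn(exp_op, upstream)               # dL/dx = upstream * exp(x)

    with tf.Session() as sess:
        print(sess.run(dx))                      # roughly [2.718, 7.389]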
def _lstm_grad_op(session, verbose=True):
    """
  :param tf.Session session:
  :return: grad function
  """
    lstm_grad_op = find_op_by_type(session=session,
                                   type_name="LstmGenericBase")
    assert lstm_grad_op is not None
    if verbose: print("op:", lstm_grad_op)

    from tensorflow.python.framework import ops
    grad_func = ops.get_gradient_function(lstm_grad_op)
    if verbose: print("grad_func:", grad_func)
    grad_op = grad_func.grad_op
    if verbose: print("grad_op:", grad_op, grad_op.__doc__)
    return grad_op
Example #16
def show_parent_of_tensor(tensor, prefix=""):

    global print_ops_printed

    for opr in active_session.graph.get_operations():
        if tensor in opr.outputs:

            # if the parent operation is Identity then leapfrog to its parent
            if opr.type == "Identity":
                show_parent_of_tensor(opr.inputs[0], prefix)

            else:  # not an Identity op so recurse normally

                # if this op has inputs then check whether it has a gradient function
                grad_string = ''
                if len(opr.inputs) > 0:
                    grad_string = " \033[92mGrads\033[0m"
                    try:
                        if get_gradient_function(opr) is None:
                            grad_string = ' \033[91mNo Grads\033[0m'
                    except LookupError:
                        grad_string = ' \033[91mlookup Error\033[0m'

                print("%s  [\033[35m%s\033[0m \033[34m\"%s\"\033[0m] %s" %
                      (prefix, opr.type, opr.node_def.name, grad_string))

                # check whether this operation was already printed while expanding an earlier op
                if opr in print_ops_printed:
                    print("%s   . . ." % prefix)
                    return opr

                print_ops_printed += [opr]

                for i in range(len(opr.inputs)):
                    parent_tensor = opr.inputs[i]
                    print(
                        "%s  |<\"%s\" with size %s and type %s>" %
                        (prefix, parent_tensor.name, str(parent_tensor.shape),
                         str(parent_tensor.dtype.base_dtype)))
                    if i == len(opr.inputs) - 1:
                        show_parent_of_tensor(parent_tensor, (prefix + "   "))
                    else:
                        show_parent_of_tensor(parent_tensor, (prefix + "  |"))

            return opr
Example #17
def get_grad_results(max_val, pool_op, input_shape, window_shape, stride_shape,
                     padding):
    """
    Construct and run a TensorFlow graph to compute pooling backprop values.

    Will create an input tensor of the required size filled with values 1, 2,
    3... and use these to compute the pooling, then create another tensor with
    the same values to use as the errors to back-propagate.
    Returns the computed values in a numpy array.
    """
    with tf.Graph().as_default():
        total_inp_size = np.prod(input_shape)
        input_vals = helpers.get_tensor_data(total_inp_size, max_val)
        inp_tensor = tf.constant(input_vals,
                                 shape=input_shape,
                                 dtype=np.float64)

        pool_output = tf.nn.pool(inp_tensor,
                                 window_shape=window_shape,
                                 pooling_type=TF_OPERATOR_MAP[pool_op],
                                 strides=stride_shape,
                                 padding=padding,
                                 name='pool',
                                 data_format="NHWC")

        tf_op = tf.get_default_graph().get_operation_by_name('pool')
        grad_fn = get_gradient_function(tf_op)
        output_shape = pool_output.shape

        total_out_size = np.prod(output_shape)
        error_vals = helpers.get_tensor_data(total_out_size, max_val)
        error_tensor = tf.constant(error_vals,
                                   shape=output_shape,
                                   dtype=np.float64)

        output = grad_fn(tf_op, error_tensor)

        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            sess.graph.finalize()
            return sess.run(output)
def gradients(ys,
              xs,
              grad_ys=None,
              stop_grads=None,
              group_aggregations=8,
              custom_matmul_grad=True):

    if group_aggregations > 8 or group_aggregations < 1:
        raise ValueError(
            "gradients: group_aggregation sizes of 1-8 supported.")

    ys = _AsList(ys)
    xs = [x.value() if isinstance(x, tf.Variable) else x for x in _AsList(xs)]

    stop_grads = [] if stop_grads is None else _AsList(stop_grads)

    grad_ys = [None] * len(ys) if grad_ys is None else _AsList(grad_ys)
    assert len(ys) == len(grad_ys)

    with ops.name_scope("gradients"):

        for i, dy in enumerate(grad_ys):
            if dy is None:
                # float grads start at ones by default
                grad_ys[i] = tf.fill(
                    tf.shape(ys[i]),
                    tf.constant(1.0, dtype=ys[i].dtype, name=f"grad_ys_{i}"))

        ys_ops = [t.op for t in ys]
        xs_ops = [t.op for t in xs]

        pending_count, reachable_ys_ops, recompute_ops = _PendingCount(
            ys_ops, xs_ops)

        # The set of ops that terminate the gradient computation.
        # Confirm that our xs tensors are just endpoints in the graph.
        # Also set any externally provided stop grad ops.
        stop_ops = set(t.op for t in stop_grads)
        for op in xs_ops:
            is_stop_op = True
            for x in op.inputs:
                if x.op in pending_count:
                    is_stop_op = False
                    break
            if is_stop_op:
                stop_ops.add(op)

        # Each op output has an associated list of gradient inputs
        # If more than one, these need to be accumulated.
        # Add the initial gradients for the ys.
        grads = dict()
        for y, dy in zip(ys, grad_ys):
            _SetGrad(grads, y, dy)

        # Add the unique ys ops that are ready into the queue.
        queue = collections.deque()
        for op in reachable_ys_ops:
            # an op is ready if it has no dependencies
            if op not in pending_count:
                queue.append(op)

        while queue:
            op = queue.popleft()

            # only pending_count==0 ops are in the queue so all grad input lists are fully populated
            # go ahead and apply any needed add_n ops to these lists.
            dys = _AggregatedGrads(grads, op, group_aggregations)

            # confirm that we have at least one tensor to compute and that this isn't a stop grad op
            if any(dy is not None for dy in dys) and op not in stop_ops:
                # get the grad function for this op
                try:
                    if custom_matmul_grad and op.type == "MatMul" and not op.get_attr(
                            "transpose_a") and not op.get_attr("transpose_b"):
                        grad_fn = _MatMulGradNN
                    else:
                        grad_fn = ops.get_gradient_function(op)
                except LookupError:
                    raise LookupError(
                        f"No gradient defined for operation '{op.name}' (op type: {op.type})"
                    )

                # for any missing input grads, build a zero input of the right dtype/shape
                for i, dy in enumerate(dys):
                    if dy is None:
                        dys[i] = tf.zeros_like(op.outputs[i])

                # call the grad function with the forward op node and list of grad inputs
                with ops.name_scope(op.name + "_grad"):
                    dxs = _AsList(grad_fn(op, *dys))

                    if len(dxs) != len(op.inputs):
                        raise ValueError(
                            f"Num gradients {len(dxs)} generated for op {op.node_def} do not match num inputs {len(op.inputs)}"
                        )

                    #_LogOpGradients(op, dys, dxs)
            else:
                dxs = [None] * len(op.inputs)

            for i, (x, dx) in enumerate(zip(op.inputs, dxs)):
                if dx is not None:
                    # force unsorted_segment_sum call
                    if isinstance(dx, ops.IndexedSlices):
                        dx = tf.convert_to_tensor(dx)
                        #dx = emb.embedding_lookup_grad_op(dx.values, dx.indices, dx.dense_shape[0])

                    # do some shape sanity checking
                    try:
                        dx.set_shape(x.shape)
                    except ValueError:
                        raise ValueError(
                            f"Incompatible shapes between op input {x.shape} and calculated input gradient {dx.shape} for {op.name} (idx:{i})"
                        )

                    # update the input grad list for the consumer of this gradient
                    _SetGrad(grads, x, dx)

            # Update pending count for the inputs of op and enqueue any ready ops
            for x in op.inputs:
                # only traverse nodes that are in the reachable gradient path (and hence have a pending entry)
                count = pending_count.get(x.op)
                if count is not None:
                    if count == 1:
                        # when count is 1 this should be the last time we reach this node
                        queue.append(x.op)
                    pending_count[x.op] = count - 1

    # Disconnect the recomputed portion of the graph from the forward pass.
    # This was only needed to direct the gradient flow.
    # Leaving these connections in place would create a circular dependency (from added control inputs).
    for op in recompute_ops:
        # Just overwrite the backward inputs with a copy of the forward inputs.
        n_out = len(op.outputs)
        for i, x in enumerate(op.inputs[:n_out]):
            op._update_input(i + n_out, x)

    return [_GetGrad(grads, x) for x in xs]
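A minimal usage sketch of the custom `gradients()` above. It assumes the helpers it references (`_AsList`, `_PendingCount`, `_SetGrad`, `_AggregatedGrads`, `_MatMulGradNN`, ...) live in the same module; the tiny matmul graph is only illustrative:

import tensorflow.compat.v1 as tf

with tf.Graph().as_default():
    x = tf.constant([[1.0, 2.0]])
    w = tf.Variable([[0.5], [0.25]])
    y = tf.matmul(x, w)

    # Partial derivatives of y with respect to x and w.
    dydx, dydw = gradients(y, [x, w])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run([dydx, dydw]))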
Example #19
  def create_op(self, *args, **kwargs):
    """Creates an `Operation`.

    For operations of the following form

      orig_value = op(*args, **kwargs)

    this function constructs the following subgraph:

      v = Variable()
      if v is not initialized:
        orig_value = op(*args, **kwargs)
        v.assign(orig_value) # Initializes v
        return orig_value
      else:
        return v

    The above transformation is not performed and the original op is returned
    as is if any of the following is true:
    * `_return_as_is` flag is set to true.
    * op_type is listed in _PASS_THROUGH_OPS
    * op has no outputs.
    * One of the op's return values has a ref type.

    Args:
      *args: Arguments for create_op()
      **kwargs: Keyword arguments for create_op(). Refer to
        tensorflow.python.framework.ops.Graph.create_op() for the mandatory
        and optional arguments.

    Returns:
      An Operation.

    Raises:
      UnimplementedError: if output type is a reference and the op's type
        is not one of the supported types in `_REF_OPS_WHITELIST`.
    """
    op_type = kwargs['op_type'] if 'op_type' in kwargs else args[0]
    output_dtypes = kwargs['dtypes'] if 'dtypes' in kwargs else args[2]
    output_dtypes = [dtypes.as_dtype(d) for d in output_dtypes]

    if self._return_as_is or op_type in _PASS_THROUGH_OPS:
      return self._wrap(super(ImperativeGraph, self).create_op(*args, **kwargs))

    if not output_dtypes:
      return self._wrap(
          super(ImperativeGraph, self).create_op(*args, **kwargs))

    output_has_ref = any([dtype._is_ref_dtype for dtype in output_dtypes])  # pylint: disable=protected-access

    if output_has_ref:
      if op_type not in _REF_OPS_WHITELIST:
        raise errors.UnimplementedError(None, None,
                                        op_type + ' op not supported in '
                                        'imperative graph')

      ret = super(ImperativeGraph, self).create_op(*args, **kwargs)

      if self._in_variable_creation:
        if op_type == 'Assign':
          self.add_pending_init(ret)

      return self._wrap(ret)

    with self.return_as_is():
      # Declares the variables to hold the output values of this op.
      op_output_var = [state_ops.variable_op_v2(
          tensor_shape.TensorShape(None), dtype, container=self._name)
                       for dtype in output_dtypes]
      # Ops to free the resources used by the temporary cache variables.
      # The following two ops are created for each cache variable,
      # having no control dependencies on any other ops :
      # var_handle_op ----> destroy_resource_op
      for dtype, v in zip(output_dtypes, op_output_var):
        with ops.control_dependencies(None):
          self._variable_cleanup_ops += [
              gen_resource_variable_ops.destroy_resource_op(
                  gen_resource_variable_ops.var_handle_op(
                      dtype, tensor_shape.TensorShape(None),
                      container=self._name, shared_name=v.op.name),
                  ignore_lookup_error=True)]

      # Create the conditional to run the original op only when the variable
      # corresponding to the first output is not initialized.
      inited = state_ops.is_variable_initialized(op_output_var[0])
      v_f, v_t = control_flow_ops.ref_switch(op_output_var[0], inited)
      # pylint: disable=protected-access
      v_f_op = gen_array_ops._ref_identity(v_f)
      v_t_op = gen_array_ops._ref_identity(v_t)
      # pylint: enable=protected-access

      with ops.control_dependencies([v_f_op.op]):
        # Create the original op
        orig_op = self._wrap(
            super(ImperativeGraph, self).create_op(*args, **kwargs))
      shapes = [val.get_shape() for val in orig_op.outputs]

      controls = []
      for var, val in zip(op_output_var, orig_op.outputs):
        if (not val.get_shape().is_fully_defined() or
            val.get_shape().num_elements() > 0):
          assign_op = state_ops.assign(var, val, validate_shape=False)
          assign_op.set_shape(val.get_shape())
          controls.append(assign_op)

      values = []
      if len(controls) > 1:
        if control_flow_ops.IsSwitch(orig_op):
          # pylint: disable=protected-access
          controls = gen_control_flow_ops._ref_merge(controls)
          # pylint: enable=protected-access
        else:
          controls = control_flow_ops.tuple(controls)

      for var, val in zip(op_output_var, orig_op.outputs):
        with ops.control_dependencies(controls):
          with self.colocate_with(v_f_op):
            real_val = array_ops.identity(val)
        with ops.control_dependencies([v_t_op.op]):
          with self.colocate_with(v_t_op):
            stored_val = array_ops.identity(var)
          stored_val.set_shape(val.get_shape())
          real_val, _ = control_flow_ops.merge([real_val, stored_val])
        real_val.op.node_def.attr['_gradient_op_type'].CopyFrom(
            attr_value_pb2.AttrValue(s=compat.as_bytes(self._merge_op_type)))
        values.append(real_val)

      for i, _ in enumerate(shapes):
        values[i].set_shape(shapes[i])
      self._outputs_map[orig_op.name] = values
      try:
        self._gradient_function_map[orig_op.name] = ops.get_gradient_function(
            orig_op)
      except (KeyError, LookupError):
        pass
      else:
        orig_op.node_def.attr['_gradient_op_type'].CopyFrom(
            attr_value_pb2.AttrValue(
                s=compat.as_bytes(self._imperative_op_type)))

      return MultiOutputOperation(values, orig_op)
Example #20
 def testRegisterGradients(self):
   g = ops.Graph()
   x = an_op(g)
   y = copy_op(x)
   fn = ops.get_gradient_function(y.op)
   self.assertEqual(_CopyGrad, fn)
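The tests above rely on module-local helpers (`an_op`, `copy_op`, `_CopyGrad`, `_CopyOverrideGrad`) that are not shown. A self-contained sketch of the same register-and-override pattern using a stock `Identity` op; the gradient name `MyIdentityGrad` is made up for illustration:

import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops

@ops.RegisterGradient("MyIdentityGrad")
def _my_identity_grad(op, grad):
    # Pass the upstream gradient straight through, like the stock Identity gradient.
    return grad

g = tf.Graph()
with g.as_default():
    x = tf.constant(3.0)
    y_default = tf.identity(x)
    with g.gradient_override_map({"Identity": "MyIdentityGrad"}):
        y_override = tf.identity(x)

print(ops.get_gradient_function(y_default.op))   # the stock Identity gradient
print(ops.get_gradient_function(y_override.op))  # _my_identity_grad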
Example #21
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None,
              stop_gradients=None):
    """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
  with respect to all `xs`. These tensors will not be backpropagated through,
  as though they had been explicitly disconnected using `stop_gradient`.  Among
  other things, this allows computation of partial derivatives as opposed to
  total derivatives. For example:

  ```python
  a = tf.constant(0.)
  b = 2 * a
  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  ```

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
  influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
  equivalent to:

  ```python
  a = tf.stop_gradient(tf.constant(0.))
  b = tf.stop_gradient(2 * a)
  g = tf.gradients(a + b, [a, b])
  ```

  `stop_gradients` provides a way of stopping gradient after the graph has
  already been constructed, as compared to `tf.stop_gradient` which is used
  during graph construction.  When the two approaches are combined,
  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
  `stop_gradients`, whichever is encountered first.

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
      through.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.

  """
    if context.in_eager_mode():
        raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
                           "functions in tf.contrib.eager.backprop instead.")
    ys = _AsList(ys)
    xs = _AsList(xs)
    stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)

    with ops.name_scope(
            name, "gradients",
            list(ys) + list(xs) + list(stop_gradients) +
            list(grad_ys)) as grad_scope:
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = [
            x.handle
            if isinstance(x, resource_variable_ops.ResourceVariable) else x
            for x in xs
        ]
        xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs,
                                                                name="x",
                                                                as_ref=True)
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        if len(ys) > 1:
            ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        stop_gradient_ops = [t.op for t in stop_gradients]
        pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                                  to_ops, from_ops,
                                                  colocate_gradients_with_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            # pylint: disable=protected-access
            ready = (pending_count[op._id] == 0)
            if ready and op._id not in to_ops_set:
                to_ops_set.add(op._id)
                queue.append(op)
            # pylint: enable=protected-access

        if loop_state:
            loop_exits = loop_state.ProcessUnusedLoopExits(
                pending_count, to_ops_set)
            for y in loop_exits:
                if _IsTrainable(y):
                    _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with _maybe_colocate_with(op, colocate_gradients_with_ops):
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=True)
                out_grads = _AggregatedGrads(grads, op, loop_state,
                                             aggregation_method)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=True)

                grad_fn = None
                # pylint: disable=protected-access
                func_call = None
                is_func_call = ops.get_default_graph()._is_function(op.type)
                has_out_grads = any(
                    isinstance(g, ops.Tensor) or g for g in out_grads)
                if has_out_grads and (op._id not in stop_ops):
                    if is_func_call:
                        func_call = ops.get_default_graph()._get_function(
                            op.type)
                        grad_fn = func_call.python_grad_func
                        # pylint: enable=protected-access
                    else:
                        # A grad_fn must be defined, either as a function or as None
                        # for ops that do not have gradients.
                        try:
                            grad_fn = ops.get_gradient_function(op)
                        except LookupError:
                            raise LookupError(
                                "No gradient defined for operation '%s' (op type: %s)"
                                % (op.name, op.type))
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=False)
                if (grad_fn or is_func_call) and has_out_grads:
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if (not isinstance(out_grad, ops.Tensor)
                                and not out_grad) and (
                                    (not grad_fn and is_func_call)
                                    or _IsTrainable(op.outputs[i])):
                            # Only trainable outputs or outputs for a function call that
                            # will use SymbolicGradient get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            # TODO (apassos) gradients of resource handles might be an
                            # issue here because of zeros.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[
                                    i] = control_flow_ops.ZerosLikeOutsideLoop(
                                        op, i)
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with ops.get_default_graph()._original_op(op):
                            # pylint: enable=protected-access
                            if grad_fn:
                                # If grad_fn was found, do not use SymbolicGradient even for
                                # functions.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: grad_fn(op, *out_grads))
                            else:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: _SymGrad(op, out_grads))
                            in_grads = _AsList(in_grads)
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                [x for x in in_grads if x is not None]) > 1:
                                with ops.device(None):
                                    with ops.colocate_with(
                                            None, ignore_existing=True):
                                        in_grads = control_flow_ops.tuple(
                                            in_grads)
                    _LogOpGradients(op, out_grads, in_grads)
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagate a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
                    if in_grad is not None:
                        if (isinstance(in_grad, ops.Tensor)
                                and t_in.dtype != dtypes.resource):
                            try:
                                in_grad.set_shape(t_in.get_shape())
                            except ValueError:
                                raise ValueError(
                                    "Incompatible shapes between op input and calculated "
                                    "input gradient.  Forward operation: %s.  Input index: %d. "
                                    "Original input shape: %s.  "
                                    "Calculated input gradient shape: %s" %
                                    (op.name, i, t_in.shape, in_grad.shape))
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=False)

            # Update pending count for the inputs of op and enqueue ready ops.
            _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                          loop_state)

    if loop_state:
        loop_state.PostProcessing()
    return [_GetGrad(grads, x) for x in xs]
import tensorflow as tf
from tensorflow.python.framework.ops import get_gradient_function
import numpy as np

a = tf.add(1, 2, name="Add_these_numbers")
b = tf.multiply(a, 3, name='mult')

mult = tf.get_default_graph().get_operation_by_name('mult')
print(get_gradient_function(mult))  # <function _MulGrad at 0x7fa29950dc80>

tf.stop_gradient(a, name='stop')
stop = tf.get_default_graph().get_operation_by_name('stop')
print(get_gradient_function(stop))  # None

c = tf.squeeze(a, name="c")
mult = tf.get_default_graph().get_operation_by_name('c')
print(get_gradient_function(mult))  # e.g. <function _SqueezeGrad at 0x...>

indices = np.asarray([[0, 0], [1, 1]])
params = np.asarray([['a', 'b'], ['c', 'd']])

ga = tf.gather_nd(params, indices, name="ga")

mult = tf.get_default_graph().get_operation_by_name('ga')
print(get_gradient_function(mult))  # e.g. <function _GatherNdGrad at 0x...>
am = tf.argmax(a, name="am")
mult = tf.get_default_graph().get_operation_by_name('am')
print(get_gradient_function(mult))  # <function _MulGrad at 0x7fa29950dc80>

indice = np.asarray([[0.1, 0.3], [1.4, 1.32]])
sm = tf.nn.softmax(indice, name="sm")
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None,
              stop_gradients=None):
  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
  with respect to all `xs`. These tensors will not be backpropagated through,
  as though they had been explicitly disconnected using `stop_gradient`.  Among
  other things, this allows computation of partial derivatives as opposed to
  total derivatives. For example:

  ```python
  a = tf.constant(0.)
  b = 2 * a
  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  ```

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
  influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
  equivalent to:

  ```python
  a = tf.stop_gradient(tf.constant(0.))
  b = tf.stop_gradient(2 * a)
  g = tf.gradients(a + b, [a, b])
  ```

  `stop_gradients` provides a way of stopping gradient after the graph has
  already been constructed, as compared to `tf.stop_gradient` which is used
  during graph construction.  When the two approaches are combined,
  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
  `stop_gradients`, whichever is encountered first.

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
      through.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.

  """
  if context.in_eager_mode():
    raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
                       "functions in tf.contrib.eager.backprop instead.")
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = [
        x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    if len(ys) > 1:
      ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    pending_count, loop_state = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        func_call = None
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            func_call = ops.get_default_graph()._get_function(op.type)
            grad_fn = func_call.python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len([x for x in in_grads
                                         if x is not None]) > 1:
                with ops.device(None):
                  with ops.colocate_with(None, ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient.  Forward operation: %s.  Input index: %d. "
                    "Original input shape: %s.  "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
Example #24
def _GradientsHelper(ys,
                     xs,
                     grad_ys=None,
                     name="gradients",
                     colocate_gradients_with_ops=False,
                     gate_gradients=False,
                     aggregation_method=None,
                     stop_gradients=None,
                     unconnected_gradients=UnconnectedGradients.NONE,
                     src_graph=None):
    """Implementation of gradients()."""
    if context.executing_eagerly():
        raise RuntimeError(
            "tf.gradients is not supported when eager execution "
            "is enabled. Use tf.GradientTape instead.")
    if src_graph is None:
        src_graph = ops.get_default_graph()
    try:
        unconnected_gradients = UnconnectedGradients(unconnected_gradients)
    except ValueError:
        raise ValueError("Unknown value for unconnected_gradients: %r" %
                         unconnected_gradients)

    # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
    # ancestor graphs. This is necessary for correctly handling captured values.
    func_graphs = []
    curr_graph = src_graph
    while _IsFunction(curr_graph):
        func_graphs.append(curr_graph)
        if isinstance(curr_graph, FuncGraph):
            curr_graph = curr_graph.outer_graph
        else:
            assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
            curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access

    ys = _AsList(ys)
    xs = _AsList(xs)
    stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)

    with ops.name_scope(
            name, "gradients",
            list(ys) + list(xs) + list(stop_gradients) +
            list(grad_ys)) as grad_scope:
        # Get a uid for this call to gradients that can be used to help
        # cluster ops for compilation.
        gradient_uid = ops.get_default_graph().unique_name("uid")
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = [
            x.handle if resource_variable_ops.is_resource_variable(x) else x
            for x in xs
        ]
        xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs,
                                                                name="x",
                                                                as_ref=True)
        xs_set = object_identity.ObjectIdentitySet(xs)
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                                 gradient_uid)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        stop_gradient_ops = [t.op for t in stop_gradients]
        reachable_to_ops, pending_count, loop_state = _PendingCount(
            to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs_set)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            ready = (pending_count[op] == 0)
            if ready and op not in to_ops_set and op in reachable_to_ops:
                to_ops_set.add(op)
                queue.append(op)

        if loop_state:
            loop_exits = loop_state.ProcessUnusedLoopExits(
                pending_count, to_ops_set)
            for y in loop_exits:
                if backprop_util.IsTrainable(y):
                    _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs_set)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with _maybe_colocate_with(op, gradient_uid,
                                      colocate_gradients_with_ops):
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=True)
                out_grads = _AggregatedGrads(grads, op, gradient_uid,
                                             loop_state, aggregation_method)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=True)

                grad_fn = None
                func_call = None
                is_partitioned_call = _IsPartitionedCall(op)
                # pylint: disable=protected-access
                is_func_call = (src_graph._is_function(op.type)
                                or is_partitioned_call)
                # pylint: enable=protected-access
                has_out_grads = any(
                    isinstance(g, ops.Tensor) or g for g in out_grads)
                if has_out_grads and (op not in stop_ops):
                    try:
                        grad_fn = ops.get_gradient_function(op)
                    except LookupError:
                        if is_func_call:
                            if is_partitioned_call:
                                func_call = src_graph._get_function(  # pylint: disable=protected-access
                                    compat.as_bytes(op.get_attr("f").name))
                            else:
                                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
                            # Note that __defun is not set if the graph is
                            # imported. If it's set, we prefer to access the original
                            # defun.
                            func_call = getattr(op, "__defun", func_call)
                            grad_fn = func_call.python_grad_func
                        else:
                            raise LookupError(
                                "No gradient defined for operation '%s' (op type: %s)"
                                % (op.name, op.type))
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=False)

                # NOTE(skyewm): We don't support computing gradients wrt a loop variable
                # unless it's within the context of a single iteration (i.e. the
                # gradient is wrt the loop parameter in the body function, not wrt or
                # through the initial value). This means if we're in a while loop
                # context, we should never see a switch node from this context.
                # pylint: disable=protected-access
                if (control_flow_util.IsSwitch(op)
                        and op._control_flow_context is not None
                        and op._control_flow_context.IsWhileContext()
                        and op._control_flow_context ==
                        ops.get_default_graph()._get_control_flow_context()):
                    _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs_set)
                # pylint: enable=protected-access

                if (grad_fn or is_func_call) and has_out_grads:
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if (not isinstance(out_grad, ops.Tensor)
                                and not out_grad) and (
                                    (not grad_fn and is_func_call) or
                                    backprop_util.IsTrainable(op.outputs[i])):
                            # Only trainable outputs or outputs for a function call that
                            # will use SymbolicGradient get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            # TODO(apassos) gradients of resource handles might be an
                            # issue here because of zeros.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            elif default_gradient.supports_default_grad(
                                    op.outputs[i]):
                                # TODO(b/143286622): The supports_default_grad check is needed
                                # because While op emits non-differentiable resource tensors
                                # as outputs. Remove this check when that is not the case.
                                out_grads[
                                    i] = control_flow_state.ZerosLikeOutsideLoop(
                                        op, i)
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with src_graph._original_op(op):
                            # pylint: enable=protected-access
                            if grad_fn:
                                # If grad_fn was found, do not use SymbolicGradient even for
                                # functions.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: grad_fn(op, *out_grads))
                            else:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: _SymGrad(op, out_grads))
                            in_grads = _AsList(in_grads)
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                [x for x in in_grads if x is not None]) > 1:
                                with ops.device(None):
                                    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                                            None,
                                            gradient_uid,
                                            ignore_existing=True):
                                        in_grads = control_flow_ops.tuple(
                                            in_grads)
                    _LogOpGradients(op, out_grads, in_grads)
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagate a list of None backwards.
                    in_grads = [None] * len(_Inputs(op, xs_set))
                # Note: we don't filter out eager inputs here because the inputs need to
                # line up with in_grads.
                for i, (t_in, in_grad) in enumerate(
                        zip(_Inputs(op, xs_set), in_grads)):
                    if in_grad is not None:
                        if (isinstance(in_grad, ops.Tensor)
                                and t_in.dtype != dtypes.resource):
                            try:
                                in_grad.set_shape(t_in.get_shape())
                            except ValueError:
                                raise ValueError(
                                    "Incompatible shapes between op input and calculated "
                                    "input gradient.  Forward operation: %s.  Input index: %d. "
                                    "Original input shape: %s.  "
                                    "Calculated input gradient shape: %s" %
                                    (op.name, i, t_in.shape, in_grad.shape))
                        if not isinstance(t_in, ops.EagerTensor):
                            _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=False)

            # Update pending count for the inputs of op and enqueue ready ops.
            _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                          loop_state, xs_set)

    if loop_state:
        loop_state.PostProcessing()
    return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
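The helper above walks the graph backwards from the ys to the xs: an op is dequeued only once the gradients for all of its consumed outputs have arrived, those gradients are aggregated, the op's registered gradient function produces gradients for its inputs, and the pending counts decide which ops become ready next. A minimal, self-contained Python sketch of that bookkeeping (a toy dictionary graph, not the TensorFlow data structures; every name below is hypothetical) may make the queue/pending-count logic easier to follow:

import collections

# Toy chain x -> square -> double, evaluated at x = 3. Each hypothetical op
# records its single input and the local derivative d(output)/d(input).
ops_by_name = {
    "square": {"input": "x", "local_grad": 2.0 * 3.0},   # d(x**2)/dx at x = 3
    "double": {"input": "square", "local_grad": 2.0},    # d(2*u)/du
}
consumers = {"x": ["square"], "square": ["double"], "double": []}

def toy_gradients(target="double", wrt="x"):
    # pending[t] = number of consumers of t whose gradient has not arrived yet.
    pending = {name: len(cons) for name, cons in consumers.items()}
    grads = collections.defaultdict(float)
    grads[target] = 1.0                      # seed with d(target)/d(target) = 1
    queue = collections.deque([target])
    while queue:
        op_name = queue.popleft()
        if op_name not in ops_by_name:       # reached a leaf tensor such as "x"
            continue
        inp = ops_by_name[op_name]["input"]
        # "Call the gradient function" and accumulate onto the input's slot.
        grads[inp] += ops_by_name[op_name]["local_grad"] * grads[op_name]
        pending[inp] -= 1
        if pending[inp] == 0:                # all consumer gradients collected
            queue.append(inp)
    return grads[wrt]

print(toy_gradients())                       # 12.0 == d(2*x**2)/dx at x = 3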
Beispiel #25
0
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
    """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
    ys = _AsList(ys)
    xs = _AsList(xs)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)
    with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                                  to_ops, from_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            # pylint: disable=protected-access
            ready = (pending_count[op._id] == 0)
            if ready and op._id not in to_ops_set:
                to_ops_set.add(op._id)
                queue.append(op)

        if loop_state:
            # The "unused" exits of the loops are added to ys. As an example,
            # people often write:
            #         v1, _ = While(p, b, [x1, x2])
            #         result = gradients(v1, x1)
            # The exit node of x2 is not included by the betweenness analysis.
            # But we need it if x2 is involved in computing v1. So we add it
            # back in backprop with a zeros_like gradient.
            loop_exits = loop_state.GetAllLoopExits()
            for y in loop_exits:
                if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
                    if _IsFloat(y):
                        # Floating-point outputs get a zero gradient.
                        _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        # The set of 'from_ops'.
        stop_ops = _StopOps(from_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
                if loop_state:
                    loop_state.EnterGradWhileContext(op)
                out_grads = _AggregatedGrads(grads, op, loop_state,
                                             aggregation_method)
                grad_fn = None

                # pylint: disable=protected-access
                is_func_call = ops.get_default_graph()._is_function(op.type)
                # pylint: enable=protected-access

                if not is_func_call and any(
                        out_grads) and op._id not in stop_ops:
                    # pylint: enable=protected-access
                    # A grad_fn must be defined, either as a function or as None
                    # for ops that do not have gradients.
                    try:
                        grad_fn = ops.get_gradient_function(op)
                    except LookupError:
                        raise LookupError(
                            "No gradient defined for operation '%s' (op type: %s)"
                            % (op.name, op.type))
                if (grad_fn or is_func_call) and any(out_grads):
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if not out_grad and _IsFloat(op.outputs[i]):
                            # Only floating-point outputs get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[i] = array_ops.zeros_like(
                                    op.outputs[i])
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with ops.get_default_graph()._original_op(op):
                            # pylint: enable=protected-access
                            wrapped_op = op
                            if loop_state:
                                wrapped_op = loop_state.MakeWrapper(op)
                            if is_func_call:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                f_in = [x for x in op.inputs] + out_grads
                                f_types = [x.dtype for x in op.inputs]
                                # pylint: disable=protected-access
                                in_grads = _AsList(
                                    functional_ops._symbolic_gradient(
                                        f_in, f_types, op.type))
                                # pylint: enable=protected-access
                            else:
                                in_grads = _AsList(
                                    grad_fn(wrapped_op, *out_grads))
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                    tuple(filter(None, in_grads))) > 1:
                                in_grads = control_flow_ops.tuple(in_grads)
                    logging.vlog(1, "Gradient for '" + op.name + "'")
                    logging.vlog(1, "  in  --> %s",
                                 ", ".join([x.name for x in out_grads if x]))
                    logging.vlog(1, "  out --> %s",
                                 ", ".join([x.name for x in in_grads if x]))
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagates a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for t_in, in_grad in zip(op.inputs, in_grads):
                    if in_grad:
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op)

            # update pending count for the inputs of op.
            # pylint: disable=protected-access
            for x in op.inputs:
                pending_count[x.op._id] -= 1
                ready = (pending_count[x.op._id] == 0)
                if loop_state and not ready:
                    ready = (pending_count[x.op._id] > 0
                             and control_flow_ops.IsLoopSwitch(x.op))
                if ready:
                    queue.append(x.op)
            for x in op.control_inputs:
                pending_count[x._id] -= 1
                if pending_count[x._id] == 0:
                    queue.append(x)
            # pylint: enable=protected-access
    return [_GetGrad(grads, x) for x in xs]
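The grad_ys behaviour described in the docstring above can be seen with a short graph-mode snippet. This is only an illustrative sketch and assumes the tf.compat.v1 API surface rather than the exact modules used in the example:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.placeholder(tf.float32, shape=[])
y1 = 2.0 * x
y2 = x * x
# Each y's incoming gradient is weighted before the per-x sums are formed:
# result = 1.0 * d(y1)/dx + 0.5 * d(y2)/dx = 2.0 + 0.5 * 2 * x
g = tf.gradients([y1, y2], [x],
                 grad_ys=[tf.constant(1.0), tf.constant(0.5)])

with tf.Session() as sess:
    print(sess.run(g, feed_dict={x: 3.0}))   # [5.0]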
Beispiel #26
0
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, has_control_flow = _PendingCount(ops.get_default_graph(),
                                                    to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:  # pylint: disable=protected-access
        to_ops_set.add(op._id)
        queue.append(op)
    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if has_control_flow:
          control_flow_ops.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, has_control_flow,
                                     aggregation_method)
        grad_fn = None
        if any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))
        if grad_fn and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not out_grad and
                dtypes.as_dtype(op.outputs[i].dtype).base_dtype in
                (dtypes.float32, dtypes.float64)):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              out_grads[i] = array_ops.zeros_like(op.outputs[i])
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              op_wrapper = op
              if has_control_flow:
                op_wrapper = control_flow_ops.MakeWrapper(op)
              in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(in_grads) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          logging.vlog(1, "Gradient for '" + op.name + "'")
          logging.vlog(1, "  in  --> %s",
                       ", ".join([x.name for x in out_grads if x]))
          logging.vlog(1, "  out --> %s",
                       ", ".join([x.name for x in in_grads if x]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagates a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad:
            _SetGrad(grads, t_in, in_grad)
        if has_control_flow:
          control_flow_ops.ExitGradWhileContext(op)

      # update pending count for the inputs of op.
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if has_control_flow and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        if pending_count[x._id] == 0:
          queue.append(x)
  return [_GetGrad(grads, x) for x in xs]
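The loop_state (and, in this older version, has_control_flow) plumbing above exists so that gradients can flow back through while loops, including their otherwise unused exits. A hedged, minimal illustration of the user-facing behaviour, again assuming the tf.compat.v1 API:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.constant(2.0)
# Three doublings inside the loop: y = x * 2**3.
_, y = tf.while_loop(lambda i, v: i < 3,
                     lambda i, v: (i + 1, v * 2.0),
                     [tf.constant(0), x])
g = tf.gradients(y, x)                       # dy/dx = 8.0

with tf.Session() as sess:
    print(sess.run(g))                       # [8.0]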
Beispiel #27
0
    def create_op(self, *args, **kwargs):
        """Creates an `Operation`.

    For operations of the following form

      orig_value = op(*args, **kwargs)

    this function constructs the following subgraph :

      v = Variable()
      if v is not initialized:
        orig_value = op(*args, **kwargs)
        v.assign(orig_value) # Initializes v
        return orig_value
      else:
        return v

    The above transformation is not performed and the original op is returned
    as is if any of the following is true:
    * `_return_as_is` flag is set to true.
    * op_type is listed in _PASS_THROUGH_OPS
    * op has no outputs.
    * One of the op's return value has a ref type.

    Args:
      *args: Arguments for create_op()
      **kwargs: Keyword arguments for create_op(). Refer to
        tensorflow.python.framework.ops.Graph.create_op() for the mandatory
        and optional arguments.

    Returns:
      An Operation.

    Raises:
      UnimplementedError: if output type is a reference and the op's type
        is not one of the supported types in `_REF_OPS_WHITELIST`.
    """
        op_type = kwargs['op_type'] if 'op_type' in kwargs else args[0]
        output_dtypes = kwargs['dtypes'] if 'dtypes' in kwargs else args[2]
        output_dtypes = [dtypes.as_dtype(d) for d in output_dtypes]

        if self._return_as_is or op_type in _PASS_THROUGH_OPS:
            return self._wrap(
                super(ImperativeGraph, self).create_op(*args, **kwargs))

        if not output_dtypes:
            return self._wrap(
                super(ImperativeGraph, self).create_op(*args, **kwargs))

        output_has_ref = any([dtype._is_ref_dtype for dtype in output_dtypes])  # pylint: disable=protected-access

        if output_has_ref:
            if op_type not in _REF_OPS_WHITELIST:
                raise errors.UnimplementedError(
                    None, None, op_type + ' op not supported in '
                    'imperative graph')

            ret = super(ImperativeGraph, self).create_op(*args, **kwargs)

            if self._in_variable_creation:
                if op_type == 'Assign':
                    self.add_pending_init(ret)

            return self._wrap(ret)

        with self.return_as_is():
            # Declares the variables to hold the output values of this op.
            op_output_var = [
                state_ops.variable_op_v2(tensor_shape.TensorShape(None),
                                         dtype,
                                         container=self._name)
                for dtype in output_dtypes
            ]
            # Ops to free the resources used by the temporary cache variables.
            # The following two ops are created for each cache variable,
            # having no control dependencies on any other ops :
            # var_handle_op ----> destroy_resource_op
            for dtype, v in zip(output_dtypes, op_output_var):
                with ops.control_dependencies(None):
                    self._variable_cleanup_ops += [
                        gen_resource_variable_ops.destroy_resource_op(
                            gen_resource_variable_ops.var_handle_op(
                                dtype,
                                tensor_shape.TensorShape(None),
                                container=self._name,
                                shared_name=v.op.name),
                            ignore_lookup_error=True)
                    ]

            # Create the conditional to run the original op only when the variable
            # corresponding to the first output is not initialized.
            inited = state_ops.is_variable_initialized(op_output_var[0])
            v_f, v_t = control_flow_ops.ref_switch(op_output_var[0], inited)
            # pylint: disable=protected-access
            v_f_op = gen_array_ops._ref_identity(v_f)
            v_t_op = gen_array_ops._ref_identity(v_t)
            # pylint: enable=protected-access

            with ops.control_dependencies([v_f_op.op]):
                # Create the original op
                orig_op = self._wrap(
                    super(ImperativeGraph, self).create_op(*args, **kwargs))
            shapes = [val.get_shape() for val in orig_op.outputs]

            controls = []
            for var, val in zip(op_output_var, orig_op.outputs):
                if (not val.get_shape().is_fully_defined()
                        or val.get_shape().num_elements() > 0):
                    assign_op = state_ops.assign(var,
                                                 val,
                                                 validate_shape=False)
                    assign_op.set_shape(val.get_shape())
                    controls.append(assign_op)

            values = []
            if len(controls) > 1:
                if control_flow_ops.IsSwitch(orig_op):
                    # pylint: disable=protected-access
                    controls = gen_control_flow_ops._ref_merge(controls)
                    # pylint: enable=protected-access
                else:
                    controls = control_flow_ops.tuple(controls)

            for var, val in zip(op_output_var, orig_op.outputs):
                with ops.control_dependencies(controls):
                    with self.colocate_with(v_f_op):
                        real_val = array_ops.identity(val)
                with ops.control_dependencies([v_t_op.op]):
                    with self.colocate_with(v_t_op):
                        stored_val = array_ops.identity(var)
                    stored_val.set_shape(val.get_shape())
                    real_val, _ = control_flow_ops.merge(
                        [real_val, stored_val])
                real_val.op.node_def.attr['_gradient_op_type'].CopyFrom(
                    attr_value_pb2.AttrValue(
                        s=compat.as_bytes(self._merge_op_type)))
                values.append(real_val)

            for i, _ in enumerate(shapes):
                values[i].set_shape(shapes[i])
            self._outputs_map[orig_op.name] = values
            try:
                self._gradient_function_map[
                    orig_op.name] = ops.get_gradient_function(orig_op)
            except (KeyError, LookupError):
                pass
            else:
                orig_op.node_def.attr['_gradient_op_type'].CopyFrom(
                    attr_value_pb2.AttrValue(
                        s=compat.as_bytes(self._imperative_op_type)))

            return MultiOutputOperation(values)
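The docstring of create_op above describes a run-once-then-cache transformation: the original op executes only while its backing variable is uninitialized, and the stored value is returned afterwards. The following is a minimal pure-Python analogue of that pattern (not the graph transformation itself; CachedOp and add_once are invented names for illustration):

class CachedOp:
    _UNSET = object()

    def __init__(self, op_fn):
        self._op_fn = op_fn          # plays the role of op(*args, **kwargs)
        self._value = self._UNSET    # plays the role of the backing Variable

    def __call__(self, *args, **kwargs):
        if self._value is self._UNSET:                  # "if v is not initialized"
            self._value = self._op_fn(*args, **kwargs)  # "v.assign(orig_value)"
        return self._value                              # "else: return v"

add_once = CachedOp(lambda a, b: a + b)
print(add_once(2, 3))   # 5 (the op actually runs)
print(add_once(7, 8))   # 5 (the cached value is returned instead of re-running)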
Beispiel #28
0
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                     gate_gradients, aggregation_method, stop_gradients):
    """Implementation of gradients()."""
    if context.executing_eagerly():
        raise RuntimeError("tf.gradients not supported when eager execution "
                           "is enabled. Use tf.contrib.eager.GradientTape "
                           "instead.")
    ys = _AsList(ys)
    xs = _AsList(xs)
    stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)

    with ops.name_scope(
            name, "gradients",
            list(ys) + list(xs) + list(stop_gradients) +
            list(grad_ys)) as grad_scope:
        # Get a uid for this call to gradients that can be used to help
        # cluster ops for compilation.
        gradient_uid = ops.get_default_graph().unique_name("uid")
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = [
            x.handle if resource_variable_ops.is_resource_variable(x) else x
            for x in xs
        ]
        xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs,
                                                                name="x",
                                                                as_ref=True)
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                                 gradient_uid)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        if len(ys) > 1:
            ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        stop_gradient_ops = [t.op for t in stop_gradients]
        reachable_to_ops, pending_count, loop_state = _PendingCount(
            ops.get_default_graph(), to_ops, from_ops,
            colocate_gradients_with_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            # pylint: disable=protected-access
            ready = (pending_count[op._id] == 0)
            if ready and op._id not in to_ops_set and op._id in reachable_to_ops:
                to_ops_set.add(op._id)
                queue.append(op)
            # pylint: enable=protected-access

        if loop_state:
            loop_exits = loop_state.ProcessUnusedLoopExits(
                pending_count, to_ops_set)
            for y in loop_exits:
                if _IsTrainable(y):
                    _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with _maybe_colocate_with(op, gradient_uid,
                                      colocate_gradients_with_ops):
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=True)
                out_grads = _AggregatedGrads(grads, op, gradient_uid,
                                             loop_state, aggregation_method)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=True)

                grad_fn = None
                func_call = None
                # pylint: disable=protected-access
                is_func_call = ops.get_default_graph()._is_function(op.type)
                # pylint: enable=protected-access
                has_out_grads = any(
                    isinstance(g, ops.Tensor) or g for g in out_grads)
                if has_out_grads and (op._id not in stop_ops):
                    if is_func_call:
                        func_call = ops.get_default_graph()._get_function(
                            op.type)
                        # Note that __defun is not set if the graph is
                        # imported. If it's set, we prefer to access the original
                        # defun.
                        func_call = getattr(op, "__defun", func_call)
                        grad_fn = func_call.python_grad_func
                    else:
                        # A grad_fn must be defined, either as a function or as None
                        # for ops that do not have gradients.
                        try:
                            grad_fn = ops.get_gradient_function(op)
                        except LookupError:
                            raise LookupError(
                                "No gradient defined for operation '%s' (op type: %s)"
                                % (op.name, op.type))
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=False)
                if (grad_fn or is_func_call) and has_out_grads:
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if (not isinstance(out_grad, ops.Tensor)
                                and not out_grad) and (
                                    (not grad_fn and is_func_call)
                                    or _IsTrainable(op.outputs[i])):
                            # Only trainable outputs or outputs for a function call that
                            # will use SymbolicGradient get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            # TODO(apassos) gradients of resource handles might be an
                            # issue here because of zeros.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[
                                    i] = control_flow_ops.ZerosLikeOutsideLoop(
                                        op, i)
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with ops.get_default_graph()._original_op(op):
                            # pylint: enable=protected-access
                            if grad_fn:
                                # If grad_fn was found, do not use SymbolicGradient even for
                                # functions.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: grad_fn(op, *out_grads))
                            else:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: _SymGrad(op, out_grads))
                            in_grads = _AsList(in_grads)
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                [x for x in in_grads if x is not None]) > 1:
                                with ops.device(None):
                                    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                                            None,
                                            gradient_uid,
                                            ignore_existing=True):
                                        in_grads = control_flow_ops.tuple(
                                            in_grads)
                    _LogOpGradients(op, out_grads, in_grads)
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagate a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
                    if in_grad is not None:
                        if (isinstance(in_grad, ops.Tensor)
                                and t_in.dtype != dtypes.resource):
                            try:
                                in_grad.set_shape(t_in.get_shape())
                            except ValueError:
                                raise ValueError(
                                    "Incompatible shapes between op input and calculated "
                                    "input gradient.  Forward operation: %s.  Input index: %d. "
                                    "Original input shape: %s.  "
                                    "Calculated input gradient shape: %s" %
                                    (op.name, i, t_in.shape, in_grad.shape))
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=False)

            # Update pending count for the inputs of op and enqueue ready ops.
            _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                          loop_state)

    if loop_state:
        loop_state.PostProcessing()
    return [_GetGrad(grads, x) for x in xs]
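This variant threads a stop_gradients argument through _StopOps, so tensors listed there behave as constants during backprop. A small usage sketch, assuming the tf.compat.v1 API:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

a = tf.constant(2.0)
b = 3.0 * a
c = b * b + a
g_stopped = tf.gradients(c, a, stop_gradients=[b])   # only the direct path: [1.0]
g_full = tf.gradients(c, a)                          # 18*a + 1 = [37.0]

with tf.Session() as sess:
    print(sess.run([g_stopped, g_full]))             # [[1.0], [37.0]]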
Beispiel #29
0
 def testRegisterGradients(self):
     g = ops.Graph()
     x = an_op(g)
     y = copy_op(x)
     fn = ops.get_gradient_function(y.op)
     self.assertEqual(_CopyGrad, fn)
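The test above checks that get_gradient_function() returns whatever was registered for the op's type. A hedged sketch of how such a gradient function is typically registered, using the public tf.RegisterGradient decorator and a hypothetical op type name:

import tensorflow as tf

@tf.RegisterGradient("MyCopy")
def _my_copy_grad(op, grad):
    # An identity-like op simply passes the incoming gradient through
    # to its single input.
    return grad

With a registration like this in place, looking up the gradient function for an op of type "MyCopy" would resolve to _my_copy_grad, which is exactly what the assertion in the test exercises for _CopyGrad.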
Beispiel #30
0
def gradients(ys, xs, grad_ys=None, name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, has_control_flow = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      if op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if has_control_flow:
          control_flow_ops.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, has_control_flow,
                                     aggregation_method)
        grad_fn = None
        if any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))
        if grad_fn and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not out_grad
                and types.as_dtype(op.outputs[i].dtype).base_dtype in 
Beispiel #31
0
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                     gate_gradients, aggregation_method, stop_gradients):
  """Implementation of gradients()."""
  if context.executing_eagerly():
    raise RuntimeError("tf.gradients not supported when eager execution "
                       "is enabled. Use tf.contrib.eager.GradientTape "
                       "instead.")
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = [
        x.handle if resource_variable_ops.is_resource_variable(x) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    if len(ys) > 1:
      ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    pending_count, loop_state = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        func_call = None
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            func_call = ops.get_default_graph()._get_function(op.type)
            grad_fn = func_call.python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len([x for x in in_grads
                                         if x is not None]) > 1:
                with ops.device(None):
                  with ops.colocate_with(None, ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient.  Forward operation: %s.  Input index: %d. "
                    "Original input shape: %s.  "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
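The gate_gradients and aggregation_method options that the helper above threads through can be exercised directly; a brief sketch assuming the tf.compat.v1 API:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.constant(4.0)
y = x * x + 3.0 * x          # dy/dx = 2*x + 3 = 11 at x = 4
grads = tf.gradients(
    y, x,
    gate_gradients=True,                          # tuple() gradients before use
    aggregation_method=tf.AggregationMethod.ADD_N)

with tf.Session() as sess:
    print(sess.run(grads))   # [11.0]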
Beispiel #32
0
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(name, "gradients", ys + xs + grad_ys):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t. its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
                                              from_ops,
                                              colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into an Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            grad_fn = ops.get_default_graph()._get_function(
                op.type).python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and
                not out_grad) and _IsTrainable(op.outputs[i]):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = grad_fn(op, *out_grads)
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _SymGrad(op, out_grads)
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad is not None:
            if isinstance(in_grad, ops.Tensor):
              in_grad.set_shape(t_in.get_shape())
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
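# A brief sketch (hypothetical, assuming TF 1.x-style graph mode) of the
# per-path accumulation performed by the grads dictionary and _AggregatedGrads
# above: x reaches y through two paths, and the partial gradients are summed.
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.constant(4.0)
y = x + x * x              # two paths from x to y
g, = tf.gradients(y, [x])  # contributions 1 and 2*x are added together
with tf.Session() as sess:
  print(sess.run(g))       # 1 + 2*4 -> 9.0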
def _GradientsHelper(ys,
                     xs,
                     grad_ys=None,
                     name="gradients",
                     colocate_gradients_with_ops=False,
                     gate_gradients=False,
                     aggregation_method=None,
                     stop_gradients=None,
                     unconnected_gradients=UnconnectedGradients.NONE,
                     src_graph=None):
  """Implementation of gradients()."""
  if context.executing_eagerly():
    raise RuntimeError("tf.gradients is not supported when eager execution "
                       "is enabled. Use tf.GradientTape instead.")
  if src_graph is None:
    src_graph = ops.get_default_graph()
  try:
    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
  except ValueError:
    raise ValueError(
        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)

  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
  # ancestor graphs. This is necessary for correctly handling captured values.
  func_graphs = []
  curr_graph = src_graph
  while _IsFunction(curr_graph):
    func_graphs.append(curr_graph)
    if isinstance(curr_graph, FuncGraph):
      curr_graph = curr_graph.outer_graph
    else:
      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access

  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
    # Get a uid for this call to gradients that can be used to help
    # cluster ops for compilation.
    gradient_uid = ops.get_default_graph().unique_name("uid")
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = [
        x.handle if resource_variable_ops.is_resource_variable(x) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                             gradient_uid)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t. its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
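    # A rough sketch of stop_gradients: the listed tensors are treated as
    # constants during backprop, e.g.
    #   a = tf.constant(0.)
    #   b = 2 * a
    #   g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])  # -> [1.0, 1.0]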
    reachable_to_ops, pending_count, loop_state = _PendingCount(
        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into an Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      ready = (pending_count[op] == 0)
      if ready and op not in to_ops_set and op in reachable_to_ops:
        to_ops_set.add(op)
        queue.append(op)

    if loop_state:
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
                                     aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        func_call = None
        is_partitioned_call = _IsPartitionedCall(op)
        # pylint: disable=protected-access
        is_func_call = (
            src_graph._is_function(op.type) or is_partitioned_call)
        # pylint: enable=protected-access
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op not in stop_ops):
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            if is_func_call:
              if is_partitioned_call:
                func_call = src_graph._get_function(  # pylint: disable=protected-access
                    compat.as_bytes(op.get_attr("f").name))
              else:
                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
              # Note that __defun is not set if the graph is
              # imported. If it's set, we prefer to access the original
              # defun.
              func_call = getattr(op, "__defun", func_call)
              grad_fn = func_call.python_grad_func
            else:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)

        # NOTE(skyewm): We don't support computing gradients w.r.t. a loop
        # variable unless it's within the context of a single iteration (i.e.
        # the gradient is w.r.t. the loop parameter in the body function, not
        # w.r.t. or through the initial value). This means that if we're in a
        # while loop context, we should never see a switch node from this
        # context.
        # pylint: disable=protected-access
        if (control_flow_util.IsSwitch(op) and
            op._control_flow_context is not None and
            op._control_flow_context.IsWhileContext() and
            op._control_flow_context ==
            ops.get_default_graph()._get_control_flow_context()):
          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs)
        # pylint: enable=protected-access

        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with src_graph._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len([x for x in in_grads
                                         if x is not None]) > 1:
                with ops.device(None):
                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                      None,
                      gradient_uid,
                      ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(_NonEagerInputs(op, xs))
        for i, (t_in, in_grad) in enumerate(zip(_NonEagerInputs(op, xs),
                                                in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient.  Forward operation: %s.  Input index: %d. "
                    "Original input shape: %s.  "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
                                    xs)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
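# An end-to-end sketch (hypothetical; assumes TF 1.x graph mode, i.e.
# tf.compat.v1 in TF 2.x) of the public tf.gradients wrapper that ultimately
# calls _GradientsHelper above, including the resource-variable handle
# handling near the top of the helper.
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

w = tf.Variable(2.0, use_resource=True)
x = tf.constant(3.0)
loss = w * x + tf.square(w)
dw, = tf.gradients(loss, [w])  # w's resource handle is dereferenced internally
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(dw))  # x + 2*w = 3.0 + 4.0 -> 7.0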