Esempio n. 1
0
    def register(self, candidate, name=None):
        """Registers a Python object "candidate" for the given "name".

        Args:
          candidate: The candidate object to add to the registry.
          name: An optional string specifying the registry key for the candidate.
                If None, candidate.__name__ will be used.
        Raises:
          KeyError: If same name is used twice.
        """
        if not name:
            name = candidate.__name__
        if name in self._registry:
            (filename, line_number, function_name,
             _) = (self._registry[name][_LOCATION_TAG])
            raise KeyError(
                "Registering two %s with name '%s' !"
                "(Previous registration was in %s %s:%d)" %
                (self._name, name, function_name, filename, line_number))

        logging.vlog(1, "Registering %s (%s) in %s.", name, candidate,
                     self._name)
        # stack trace is [this_function, Register(), user_function,...]
        # so the user function is #2.
        stack = traceback.extract_stack()
        self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: stack[2]}
Esempio n. 2
0
def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx):
    """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy parray as the input data for x
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.
  """
    # Complex vectors are treated as vectors of twice as many reals.
    if x.dtype.is_complex:
        x_shape = tuple(x_shape) + (2, )
    dy_factor = 2 if dy.dtype.is_complex else 1

    # To compute the jacobian, we treat x and y as one-dimensional vectors.
    x_size = _product(x_shape)
    x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
    dy_size = _product(dy_shape) * dy_factor

    jacobian = np.zeros((x_size, dy_size),
                        dtype=x.dtype.real_dtype.as_numpy_dtype)
    # For each of the entry of dy, we set this to be 1 and
    # everything else to be 0 and compute the backprop -- this will give us one
    # one column of the Jacobian matrix.
    dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
    dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
    sess = ops.get_default_session()
    for col in range(dy_size):
        dy_data_flat[col] = 1
        if isinstance(dx, ops.IndexedSlices):
            backprop_indices, backprop_values = sess.run(
                [dx.indices, dx.values], feed_dict={
                    x: x_data,
                    dy: dy_data
                })
            for i, v in zip(backprop_indices, backprop_values):
                r_begin = i * x_val_size
                r_end = r_begin + x_val_size
                jacobian[r_begin:r_end, col] += v.flat
        else:
            assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
            backprop = sess.run(dx, feed_dict={x: x_data, dy: dy_data})
            jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
        dy_data_flat[col] = 0

    logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
    return jacobian
Esempio n. 3
0
def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy parray as the input data for x
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
  dy_size = _product(dy_shape) * dy_factor

  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)
  # For each of the entry of dy, we set this to be 1 and
  # everything else to be 0 and compute the backprop -- this will give us one
  # one column of the Jacobian matrix.
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values], feed_dict={x: x_data, dy: dy_data})
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(dx, feed_dict={x: x_data, dy: dy_data})
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian
Esempio n. 4
0
  def _close_on_stop(self, sess, cancel_op, coord):
    """Close the queue when the Coordinator requests stop.

    Args:
      sess: A Session.
      cancel_op: The Operation to run.
      coord: Coordinator.
    """
    coord.wait_for_stop()
    try:
      sess.run(cancel_op)
    except Exception as e:
      # Intentionally ignore errors from cancel_op.
      logging.vlog(1, "Ignored exception: %s", str(e))
Esempio n. 5
0
    def _close_on_stop(self, sess, cancel_op, coord):
        """Close the queue when the Coordinator requests stop.

    Args:
      sess: A Session.
      cancel_op: The Operation to run.
      coord: Coordinator.
    """
        coord.wait_for_stop()
        try:
            sess.run(cancel_op)
        except Exception as e:
            # Intentionally ignore errors from cancel_op.
            logging.vlog(1, "Ignored exception: %s", str(e))
Esempio n. 6
0
def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """

  # To compute the jacobian, we treat x and y as one-dimensional vectors
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(1 / (2 * delta), dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each of the entry of x, we slightly perturbs this by adding and
  # subtracting a delta and then compute difference between the outputs. This
  # will give us one row of the Jacobian matrix.
  for row in range(x_size):
    x_pos = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict={x: x_pos})
    x_neg = x_data.copy()
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict={x: x_neg})
    diff = scale * (y_pos - y_neg)
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
Esempio n. 7
0
def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta):
    """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """

    # To compute the jacobian, we treat x and y as one-dimensional vectors
    x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
    y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
    x_dtype = x.dtype.real_dtype.as_numpy_dtype
    y_dtype = y.dtype.real_dtype.as_numpy_dtype

    # Make sure we have the right types
    x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
    scale = np.asarray(2 * delta, dtype=y_dtype)[()]

    jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
    # For each of the entry of x, we slightly perturbs this by adding and
    # subtracting a delta and then compute difference between the outputs. This
    # will give us one row of the Jacobian matrix.
    for row in range(x_size):
        x_pos = x_data.copy()
        x_neg = x_data.copy()
        x_pos.ravel().view(x_dtype)[row] += delta
        y_pos = y.eval(feed_dict={x: x_pos})
        x_neg.ravel().view(x_dtype)[row] -= delta
        y_neg = y.eval(feed_dict={x: x_neg})
        diff = (y_pos - y_neg) / scale
        jacobian[row, :] = diff.ravel().view(y_dtype)

    logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
    return jacobian
Esempio n. 8
0
    def _run(self, sess, enqueue_op, coord=None):
        """Execute the enqueue op in a loop, close the queue in case of error.

    Args:
      sess: A Session.
      enqueue_op: The Operation to run.
      coord: Optional Coordinator object for reporting errors and checking
        for stop conditions.
    """
        decremented = False
        try:
            while True:
                if coord and coord.should_stop():
                    break
                try:
                    sess.run(enqueue_op)
                except errors.OutOfRangeError:
                    # This exception indicates that a queue was closed.
                    with self._lock:
                        self._runs -= 1
                        decremented = True
                        if self._runs == 0:
                            try:
                                sess.run(self._close_op)
                            except Exception as e:
                                # Intentionally ignore errors from close_op.
                                logging.vlog(1, "Ignored exception: %s",
                                             str(e))
                        return
        except Exception as e:
            # This catches all other exceptions.
            if coord:
                coord.request_stop(e)
            else:
                logging.error("Exception in QueueRunner: %s", str(e))
                with self._lock:
                    self._exceptions_raised.append(e)
                raise
        finally:
            # Make sure we account for all terminations: normal or errors.
            if not decremented:
                with self._lock:
                    self._runs -= 1
Esempio n. 9
0
  def _run(self, sess, enqueue_op, coord=None):
    """Execute the enqueue op in a loop, close the queue in case of error.

    Args:
      sess: A Session.
      enqueue_op: The Operation to run.
      coord: Optional Coordinator object for reporting errors and checking
        for stop conditions.
    """
    decremented = False
    try:
      while True:
        if coord and coord.should_stop():
          break
        try:
          sess.run(enqueue_op)
        except errors.OutOfRangeError:
          # This exception indicates that a queue was closed.
          with self._lock:
            self._runs -= 1
            decremented = True
            if self._runs == 0:
              try:
                sess.run(self._close_op)
              except Exception as e:
                # Intentionally ignore errors from close_op.
                logging.vlog(1, "Ignored exception: %s", str(e))
            return
    except Exception as e:
      # This catches all other exceptions.
      if coord:
        coord.request_stop(e)
      else:
        logging.error("Exception in QueueRunner: %s", str(e))
        with self._lock:
          self._exceptions_raised.append(e)
        raise
    finally:
      # Make sure we account for all terminations: normal or errors.
      if not decremented:
        with self._lock:
          self._runs -= 1
def _ComputeNumericJacobian(x, x_shape, x_data, y, y_shape, delta):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Japcobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """

  # To compute the jacobian, we treat x and y are one-dimensional vectors
  x_size = _Product(x_shape)
  y_size = _Product(y_shape)

  jacobian = np.zeros((x_size, y_size), dtype=x_data.dtype)
  # For each of the entry of x, we slightly perturbs this by adding and
  # subtracting a delta and then compute difference between the outputs. This
  # will give us one row of the Jacobian matrix.
  for row in range(0, x_size):
    x_pos = x_data.copy()
    x_pos.flat[row] += delta
    y_pos = y.eval(feed_dict={x: x_pos})
    x_neg = x_data.copy()
    x_neg.flat[row] -= delta
    y_neg = y.eval(feed_dict={x: x_neg})
    diff = (y_pos - y_neg) / (2 * delta)
    jacobian[row, :] = diff.reshape(y_size)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
Esempio n. 11
0
    def register(self, candidate, name=None):
        """Registers a Python object "candidate" for the given "name".
        Args:
          candidate: The candidate object to add to the registry.
          name: An optional string specifying the registry key for the candidate.
                If None, candidate.__name__ will be used.
        Raises:
          KeyError: If same name is used twice.
        """
        if not name:
            name = candidate.__name__
        if name in self._registry:
            (filename, line_number, function_name, _) = (
                self._registry[name][_LOCATION_TAG])
            raise KeyError("Registering two %s with name '%s' !"
                           "(Previous registration was in %s %s:%d)" %
                           (self._name, name, function_name, filename, line_number))

        logging.vlog(1, "Registering %s (%s) in %s.", name, candidate, self._name)
        # stack trace is [this_function, Register(), user_function,...]
        # so the user function is #2.
        stack = traceback.extract_stack()
        self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: stack[2]}
Esempio n. 12
0
def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
    """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    loop_state: An object for maintaining the state of the while loops in the
                graph. It is of type ControlFlowState. None if the graph
                contains no while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per each output of `op`. If the gradients
      for a particular output is a list, this function aggregates it
      before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.

  """
    if aggregation_method is None:
        aggregation_method = AggregationMethod.DEFAULT
    if aggregation_method not in [
            AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    ]:
        raise ValueError("Invalid aggregation_method specified %s." %
                         aggregation_method)
    out_grads = _GetGrads(grads, op)
    for i, out_grad in enumerate(out_grads):
        if loop_state:
            if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
                assert control_flow_ops.IsLoopSwitch(op)
                continue
        # Grads have to be Tensors or IndexedSlices
        if not all([
                isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in out_grad if g
        ]):
            raise TypeError("gradients have to be either all Tensors "
                            "or all IndexedSlices")
        # Aggregate multiple gradients, and convert [] to None.
        if out_grad:
            if all([isinstance(g, ops.Tensor) for g in out_grad if g]):
                tensor_shape = _AccumulatorShape(out_grad)
                if len(out_grad) < 2:
                    used = "nop"
                    out_grads[i] = out_grad[0]
                elif (aggregation_method
                      == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
                      and len(out_grad) > 2
                      and tensor_shape.is_fully_defined()):
                    # The benefit of using AccumulateN is that its inputs can be combined
                    # in any order and this can allow the expression to be evaluated with
                    # a smaller memory footprint.  When used with gpu_allocator_retry,
                    # it is possible to compute a sum of terms which are much larger than
                    # total GPU memory.
                    # AccumulateN can currently only be used if we know the shape for
                    # an accumulator variable.  If this is not known, or if we only have
                    # 2 grads then we fall through to the "tree" case below.
                    used = "accumulate_n"
                    out_grads[i] = math_ops.accumulate_n(out_grad)
                elif aggregation_method in [
                        AggregationMethod.EXPERIMENTAL_TREE,
                        AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
                ]:
                    # Aggregate all gradients by doing pairwise sums: this may
                    # reduce performance, but it can improve memory because the
                    # gradients can be released earlier.
                    #
                    # TODO(vrv): Consider replacing this with a version of
                    # tf.AddN() that eagerly frees its inputs as soon as they are
                    # ready, so the order of this tree does not become a problem.
                    used = "tree"
                    with ops.name_scope(op.name + "_gradient_sum"):
                        running_sum = out_grad[0]
                        for grad in out_grad[1:]:
                            running_sum = math_ops.add_n([running_sum, grad])
                        out_grads[i] = running_sum
                else:
                    used = "add_n"
                    out_grads[i] = math_ops.add_n(out_grad)
                logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
                             len(out_grad), tensor_shape, used)
            else:
                out_grad = math_ops._as_indexed_slices_list(
                    [g for g in out_grad if g])
                out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
                # Form IndexedSlices out of the concatenated values and
                # indices.
                out_grads[i] = ops.IndexedSlices(
                    array_ops.concat(0, [x.values for x in out_grad]),
                    array_ops.concat(0, [x.indices for x in out_grad]),
                    out_grad[0].dense_shape)
        else:
            out_grads[i] = []
    return out_grads
Esempio n. 13
0
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
    """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
    ys = _AsList(ys)
    xs = _AsList(xs)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)
    with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                                  to_ops, from_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            # pylint: disable=protected-access
            ready = (pending_count[op._id] == 0)
            if ready and op._id not in to_ops_set:
                to_ops_set.add(op._id)
                queue.append(op)

        if loop_state:
            # The "unused" exits of the loops are added to ys. As an example,
            # people often write:
            #         v1, _ = While(p, b, [x1, x2])
            #         result = gradients(v1, x1)
            # The exit node of x2 is not included by the betweenness analysis.
            # But we need it if x2 is involved in computing v1. So we add it
            # back in backprop with a zeros_like gradient.
            loop_exits = loop_state.GetAllLoopExits()
            for y in loop_exits:
                if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
                    if _IsFloat(y):
                        # Floating-point outputs get a zero gradient.
                        _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        # The set of 'from_ops'.
        stop_ops = _StopOps(from_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
                if loop_state:
                    loop_state.EnterGradWhileContext(op)
                out_grads = _AggregatedGrads(grads, op, loop_state,
                                             aggregation_method)
                grad_fn = None

                # pylint: disable=protected-access
                is_func_call = ops.get_default_graph()._is_function(op.type)
                # pylint: enable=protected-access

                if not is_func_call and any(
                        out_grads) and op._id not in stop_ops:
                    # pylint: enable=protected-access
                    # A grad_fn must be defined, either as a function or as None
                    # for ops that do not have gradients.
                    try:
                        grad_fn = ops.get_gradient_function(op)
                    except LookupError:
                        raise LookupError(
                            "No gradient defined for operation '%s' (op type: %s)"
                            % (op.name, op.type))
                if (grad_fn or is_func_call) and any(out_grads):
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if not out_grad and _IsFloat(op.outputs[i]):
                            # Only floating-point outputs get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[i] = array_ops.zeros_like(
                                    op.outputs[i])
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with ops.get_default_graph()._original_op(op):
                            # pylint: enable=protected-access
                            wrapped_op = op
                            if loop_state:
                                wrapped_op = loop_state.MakeWrapper(op)
                            if is_func_call:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                f_in = [x for x in op.inputs] + out_grads
                                f_types = [x.dtype for x in op.inputs]
                                # pylint: disable=protected-access
                                in_grads = _AsList(
                                    functional_ops._symbolic_gradient(
                                        f_in, f_types, op.type))
                                # pylint: enable=protected-access
                            else:
                                in_grads = _AsList(
                                    grad_fn(wrapped_op, *out_grads))
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                    tuple(filter(None, in_grads))) > 1:
                                in_grads = control_flow_ops.tuple(in_grads)
                    logging.vlog(1, "Gradient for '" + op.name + "'")
                    logging.vlog(1, "  in  --> %s",
                                 ", ".join([x.name for x in out_grads if x]))
                    logging.vlog(1, "  out --> %s",
                                 ", ".join([x.name for x in in_grads if x]))
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagates a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for t_in, in_grad in zip(op.inputs, in_grads):
                    if in_grad:
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op)

            # update pending count for the inputs of op.
            # pylint: disable=protected-access
            for x in op.inputs:
                pending_count[x.op._id] -= 1
                ready = (pending_count[x.op._id] == 0)
                if loop_state and not ready:
                    ready = (pending_count[x.op._id] > 0
                             and control_flow_ops.IsLoopSwitch(x.op))
                if ready:
                    queue.append(x.op)
            for x in op.control_inputs:
                pending_count[x._id] -= 1
                if pending_count[x._id] is 0:
                    queue.append(x)
            # pylint: enable=protected-access
    return [_GetGrad(grads, x) for x in xs]
Esempio n. 14
0
def _AggregatedGrads(grads, op, has_control_flow, aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    has_control_flow: True iff the graph contains control flow ops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per each output of `op`. If the gradients
      for a particular output is a list, this function aggregates it
      before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.

  """
  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  if aggregation_method not in [AggregationMethod.ADD_N,
                                AggregationMethod.EXPERIMENTAL_TREE,
                                AggregationMethod.EXPERIMENTAL_ACCUMULATE_N]:
    raise ValueError("Invalid aggregation_method specified.")
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if has_control_flow:
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert op.type == "Switch"
        continue
    # Grads have to be Tensors or IndexedSlices
    if not all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in out_grad if g]):
      raise TypeError("gradients have to be either all Tensors "
                      "or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if all([isinstance(g, ops.Tensor) for g in out_grad if g]):
        tensor_shape = _AccumulatorShape(out_grad)
        if len(out_grad) < 2:
          used = "nop"
          out_grads[i] = out_grad[0]
        elif (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
              and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
          # The benefit of using AccumulateN is that its inputs can be combined
          # in any order and this can allow the expression to be evaluated with
          # a smaller memory footprint.  When used with gpu_allocator_retry,
          # it is possible to compute a sum of terms which are much larger than
          # total GPU memory.
          # AccumulateN can currently only be used if we know the shape for
          # an accumulator variable.  If this is not known, or if we only have
          # 2 grads then we fall through to the "tree" case below.
          used = "accumulate_n"
          out_grads[i] = math_ops.accumulate_n(out_grad)
        elif aggregation_method in [AggregationMethod.EXPERIMENTAL_TREE,
                                    AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
                                   ]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = math_ops.add_n(out_grad)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                     tensor_shape, used)
      else:
        out_grad = math_ops._as_indexed_slices_list([g for g in out_grad if g])
        out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
        # Form IndexedSlices out of the concatenated values and
        # indices.
        out_grads[i] = ops.IndexedSlices(
            array_ops.concat(0, [x.values for x in out_grad]),
            array_ops.concat(0, [x.indices
                                 for x in out_grad]), out_grad[0].dense_shape)
    else:
      out_grads[i] = []
  return out_grads
Esempio n. 15
0
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, has_control_flow = _PendingCount(ops.get_default_graph(),
                                                    to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:  # pylint: disable=protected-access
        to_ops_set.add(op._id)
        queue.append(op)
    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if has_control_flow:
          control_flow_ops.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, has_control_flow,
                                     aggregation_method)
        grad_fn = None
        if any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))
        if grad_fn and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not out_grad and
                dtypes.as_dtype(op.outputs[i].dtype).base_dtype in
                (dtypes.float32, dtypes.float64)):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              out_grads[i] = array_ops.zeros_like(op.outputs[i])
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              op_wrapper = op
              if has_control_flow:
                op_wrapper = control_flow_ops.MakeWrapper(op)
              in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(in_grads) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          logging.vlog(1, "Gradient for '" + op.name + "'")
          logging.vlog(1, "  in  --> %s",
                       ", ".join([x.name for x in out_grads if x]))
          logging.vlog(1, "  out --> %s",
                       ", ".join([x.name for x in in_grads if x]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagates a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad:
            _SetGrad(grads, t_in, in_grad)
        if has_control_flow:
          control_flow_ops.ExitGradWhileContext(op)

      # update pending count for the inputs of op.
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if has_control_flow and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        if pending_count[x._id] is 0:
          queue.append(x)
  return [_GetGrad(grads, x) for x in xs]
Esempio n. 16
0
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)
  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                              to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)

    if loop_state:
      # The "unused" exits of the loops are added to ys. As an example,
      # people often write:
      #         v1, _ = While(p, b, [x1, x2])
      #         result = gradients(v1, x1)
      # The exit node of x2 is not included by the betweenness analysis.
      # But we need it if x2 is involved in computing v1. So we add it
      # back in backprop with a zeros_like gradient.
      loop_exits = loop_state.GetAllLoopExits()
      for y in loop_exits:
        if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
          if _IsFloat(y):
            # Floating-point outputs get a zero gradient.
            _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        is_func_call = ops.get_default_graph()._is_function(op.type)
        if not is_func_call and any(
            isinstance(g, ops.Tensor) or g for g in out_grads) and (
                op._id not in stop_ops):
          # pylint: enable=protected-access
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))

        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and any(
            isinstance(g, ops.Tensor) or g for g in out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor)
                and not out_grad) and _IsFloat(op.outputs[i]):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if is_func_call:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                f_in = [x for x in op.inputs] + out_grads
                f_types = [x.dtype for x in op.inputs]
                # pylint: disable=protected-access
                in_grads = _AsList(functional_ops._symbolic_gradient(
                    f_in, f_types, op.type))
                # pylint: enable=protected-access
              else:
                in_grads = _AsList(grad_fn(op, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          logging.vlog(1, "Gradient for '" + op.name + "'")
          def _FilterGrad(x):
            if x is None:
              return False
            if isinstance(x, (list, tuple)):
              return bool(x)
            else:
              return True
          logging.vlog(1, "  in  --> %s",
                       ", ".join([x.name for x in out_grads if _FilterGrad(x)]))
          logging.vlog(1, "  out --> %s",
                       ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagates a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad is not None:
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # update pending count for the inputs of op.
      # pylint: disable=protected-access
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if loop_state and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        if pending_count[x._id] is 0:
          queue.append(x)
      # pylint: enable=protected-access
  return [_GetGrad(grads, x) for x in xs]